From fd5ec3b51ce9500e36597ac39e486a5cd40cddc4 Mon Sep 17 00:00:00 2001
From: Arthur Outhenin-Chalandre
Date: Tue, 23 Nov 2021 09:30:38 +0100
Subject: [PATCH] mgr/dashboard: monitoring: refactor into ceph-mixin

A mixin is a way to bundle dashboards, Prometheus rules and alerts into
a jsonnet package. Shifting to a mixin will allow easier integration
with the monitoring automation that some users may use.

This commit moves `/monitoring/grafana/dashboards` and
`/monitoring/prometheus` to `/monitoring/ceph-mixin`. The Prometheus
alerts were also converted to Jsonnet in an automated way (from yaml to
json to jsonnet). This commit minimises the changes made to the
generated files and should change neither the dashboards nor the
Prometheus alerts.

In the future some configuration will also be added to the jsonnet to
add more functionality to the dashboards or alerts (e.g. multi
cluster).

Fixes: https://tracker.ceph.com/issues/53374
Signed-off-by: Arthur Outhenin-Chalandre
(cherry picked from commit 98236e3a1d2855c95d86640645c2984efa83791f)

Conflicts:
	monitoring/grafana/dashboards/CMakeLists.txt
	monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet
	monitoring/grafana/dashboards/tox.ini
	monitoring/prometheus/README.md
	src/test/CMakeLists.txt
	    Trivial fixes
	monitoring/ceph-mixin/dashboards_out/osds-overview.json
	monitoring/ceph-mixin/dashboards_out/pool-overview.json
	monitoring/ceph-mixin/dashboards_out/rbd-details.json
	    Include small fixes not brought in pacific yet
---
 CMakeLists.txt | 6 +-
 ceph.spec.in | 4 +-
 debian/rules | 2 +-
 monitoring/CMakeLists.txt | 1 -
 monitoring/ceph-mixin/.gitignore | 1 +
 monitoring/ceph-mixin/.pylintrc | 1 +
 monitoring/ceph-mixin/CMakeLists.txt | 53 +
 monitoring/ceph-mixin/Makefile | 24 +
 monitoring/ceph-mixin/README.md | 52 +
 monitoring/ceph-mixin/alerts.libsonnet | 3 +
 monitoring/ceph-mixin/config.libsonnet | 1 +
 monitoring/ceph-mixin/dashboards.jsonnet | 6 +
 .../ceph-mixin/dashboards/cephfs.libsonnet | 103 ++
 .../dashboards/dashboards.libsonnet | 6 +
 .../ceph-mixin/dashboards/host.libsonnet | 562 ++++++
 .../ceph-mixin/dashboards/osd.libsonnet | 533 ++++++
 .../ceph-mixin/dashboards/pool.libsonnet | 570 +++++++
 .../ceph-mixin/dashboards/rbd.libsonnet | 309 ++++
 .../ceph-mixin/dashboards/rgw.libsonnet | 643 +++++++
 .../ceph-mixin/dashboards/utils.libsonnet | 172 ++
 .../dashboards_out}/ceph-cluster.json | 0
 .../dashboards_out}/cephfs-overview.json | 2 +
 .../dashboards_out}/host-details.json | 9 +
 .../dashboards_out}/hosts-overview.json | 6 +-
 .../dashboards_out}/osd-device-details.json | 7 +
 .../dashboards_out}/osds-overview.json | 15 +-
 .../dashboards_out}/pool-detail.json | 4 +
 .../dashboards_out}/pool-overview.json | 28 +-
 .../dashboards_out}/radosgw-detail.json | 3 +
 .../dashboards_out}/radosgw-overview.json | 10 +
 .../radosgw-sync-overview.json | 4 +
 .../dashboards_out}/rbd-details.json | 15 +-
 .../dashboards_out}/rbd-overview.json | 6 +
 monitoring/ceph-mixin/jsonnetfile.json | 15 +
 monitoring/ceph-mixin/jsonnetfile.lock.json | 16 +
 monitoring/ceph-mixin/lint-jsonnet.sh | 5 +
 monitoring/ceph-mixin/mixin.libsonnet | 3 +
 .../prometheus_alerts.yaml} | 2 +-
 .../requirements-alerts.txt} | 0
 .../requirements-grafonnet.txt | 0
 .../requirements-lint.txt | 0
 monitoring/ceph-mixin/test-jsonnet.sh | 31 +
 .../tests_alerts}/README.md | 0
 .../tests_alerts}/__init__.py | 0
 .../ceph-mixin/tests_alerts/settings.py | 11 +
 .../tests_alerts}/test_alerts.yml | 4 +-
 .../tests_alerts}/test_syntax.py | 0
 .../tests_alerts}/test_unittests.py | 0
 .../tests_alerts}/utils.py | 0
 .../tests_alerts}/validate_rules.py | 11 +-
 .../tests_dashboards}/__init__.py | 0
 .../tests_dashboards/features}/__init__.py | 0
 .../features/ceph-cluster.feature | 0
 .../tests_dashboards}/features/environment.py | 4 +-
 .../features/host-details.feature | 0
 .../features/hosts_overview.feature | 0
 .../features/osd-device-details.feature | 0
 .../features/osds-overview.feature | 0
 .../features/radosgw-detail.feature | 0
 .../features/radosgw_overview.feature | 0
 .../tests_dashboards}/features/self.feature | 0
 .../features/steps/__init__.py | 0
 .../tests_dashboards}/requirements.txt | 0
 .../tests_dashboards}/util.py | 3 +-
 monitoring/ceph-mixin/tox.ini | 69 +
 monitoring/grafana/README.md | 14 -
 monitoring/grafana/build/Makefile | 2 +-
 monitoring/grafana/dashboards/.pylintrc | 1 -
 monitoring/grafana/dashboards/CMakeLists.txt | 38 -
 monitoring/grafana/dashboards/README | 28 -
 .../jsonnet/grafana_dashboards.jsonnet | 1510 -----------------
 monitoring/grafana/dashboards/test-jsonnet.sh | 30 -
 monitoring/grafana/dashboards/tox.ini | 44 -
 monitoring/prometheus/CMakeLists.txt | 1 -
 monitoring/prometheus/README.md | 7 -
 monitoring/prometheus/tests/CMakeLists.txt | 4 -
 monitoring/prometheus/tests/settings.py | 2 -
 monitoring/prometheus/tests/tox.ini | 11 -
 src/pybind/mgr/dashboard/grafana.py | 2 +-
 src/pybind/mgr/dashboard/tox.ini | 2 +-
 src/test/CMakeLists.txt | 18 -
 81 files changed, 3288 insertions(+), 1761 deletions(-)
 delete mode 100644 monitoring/CMakeLists.txt
 create mode 100644 monitoring/ceph-mixin/.gitignore
 create mode 120000 monitoring/ceph-mixin/.pylintrc
 create mode 100644 monitoring/ceph-mixin/CMakeLists.txt
 create mode 100644 monitoring/ceph-mixin/Makefile
 create mode 100644 monitoring/ceph-mixin/README.md
 create mode 100644 monitoring/ceph-mixin/alerts.libsonnet
 create mode 100644 monitoring/ceph-mixin/config.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards.jsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/cephfs.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/dashboards.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/host.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/osd.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/pool.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/rbd.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/rgw.libsonnet
 create mode 100644 monitoring/ceph-mixin/dashboards/utils.libsonnet
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/ceph-cluster.json (100%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/cephfs-overview.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/host-details.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/hosts-overview.json (93%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/osd-device-details.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/osds-overview.json (98%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/pool-detail.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/pool-overview.json (98%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/radosgw-detail.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/radosgw-overview.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/radosgw-sync-overview.json (99%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/rbd-details.json (96%)
 rename monitoring/{grafana/dashboards => ceph-mixin/dashboards_out}/rbd-overview.json (99%)
 create mode 100644 monitoring/ceph-mixin/jsonnetfile.json
 create mode 100644 monitoring/ceph-mixin/jsonnetfile.lock.json
 create mode 100755 monitoring/ceph-mixin/lint-jsonnet.sh
 create mode 100644 monitoring/ceph-mixin/mixin.libsonnet
 rename monitoring/{prometheus/alerts/ceph_default_alerts.yml => ceph-mixin/prometheus_alerts.yaml} (99%)
 rename monitoring/{prometheus/tests/requirements.txt => ceph-mixin/requirements-alerts.txt} (100%)
 rename monitoring/{grafana/dashboards => ceph-mixin}/requirements-grafonnet.txt (100%)
 rename monitoring/{grafana/dashboards => ceph-mixin}/requirements-lint.txt (100%)
 create mode 100755 monitoring/ceph-mixin/test-jsonnet.sh
 rename monitoring/{prometheus/tests => ceph-mixin/tests_alerts}/README.md (100%)
 rename monitoring/{grafana/dashboards/tests/features => ceph-mixin/tests_alerts}/__init__.py (100%)
 create mode 100644 monitoring/ceph-mixin/tests_alerts/settings.py
 rename monitoring/{prometheus/tests => ceph-mixin/tests_alerts}/test_alerts.yml (99%)
 rename monitoring/{prometheus/tests => ceph-mixin/tests_alerts}/test_syntax.py (100%)
 rename monitoring/{prometheus/tests => ceph-mixin/tests_alerts}/test_unittests.py (100%)
 rename monitoring/{prometheus/tests => ceph-mixin/tests_alerts}/utils.py (100%)
 rename monitoring/{prometheus/tests => ceph-mixin/tests_alerts}/validate_rules.py (98%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/__init__.py (100%)
 rename monitoring/{prometheus/tests => ceph-mixin/tests_dashboards/features}/__init__.py (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/ceph-cluster.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/environment.py (97%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/host-details.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/hosts_overview.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/osd-device-details.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/osds-overview.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/radosgw-detail.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/radosgw_overview.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/self.feature (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/features/steps/__init__.py (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/requirements.txt (100%)
 rename monitoring/{grafana/dashboards/tests => ceph-mixin/tests_dashboards}/util.py (95%)
 create mode 100644 monitoring/ceph-mixin/tox.ini
 delete mode 100644 monitoring/grafana/README.md
 delete mode 120000 monitoring/grafana/dashboards/.pylintrc
 delete mode 100644 monitoring/grafana/dashboards/CMakeLists.txt
 delete mode 100644 monitoring/grafana/dashboards/README
 delete mode 100644 monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet
 delete mode 100644 monitoring/grafana/dashboards/test-jsonnet.sh
 delete mode 100644 monitoring/grafana/dashboards/tox.ini
 delete mode 100644 monitoring/prometheus/CMakeLists.txt
 delete mode 100644
monitoring/prometheus/README.md delete mode 100644 monitoring/prometheus/tests/CMakeLists.txt delete mode 100644 monitoring/prometheus/tests/settings.py delete mode 100644 monitoring/prometheus/tests/tox.ini diff --git a/CMakeLists.txt b/CMakeLists.txt index 12672dbccfe74..b07d68acdd438 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -685,8 +685,6 @@ add_custom_target(check add_subdirectory(src) add_subdirectory(qa) -add_subdirectory(monitoring) - add_subdirectory(doc) if(WITH_MANPAGE) add_subdirectory(man) @@ -702,9 +700,7 @@ if(LINUX) endif() option(WITH_GRAFANA "install grafana dashboards" OFF) -if(WITH_GRAFANA) - add_subdirectory(monitoring/grafana/dashboards) -endif() +add_subdirectory(monitoring/ceph-mixin) CMAKE_DEPENDENT_OPTION(WITH_BOOST_VALGRIND "Boost support for valgrind" OFF "NOT WITH_SYSTEM_BOOST" OFF) diff --git a/ceph.spec.in b/ceph.spec.in index ca97722d4467e..eaac334fcaa6a 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -1420,7 +1420,7 @@ mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rbd-mirror # prometheus alerts -install -m 644 -D monitoring/prometheus/alerts/ceph_default_alerts.yml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml +install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml %{buildroot}/etc/prometheus/ceph/ceph_default_alerts.yml %if 0%{?suse_version} # create __pycache__ directories and their contents @@ -2463,8 +2463,6 @@ exit 0 %endif %attr(0755,root,root) %dir %{_sysconfdir}/grafana/dashboards/ceph-dashboard %config %{_sysconfdir}/grafana/dashboards/ceph-dashboard/* -%doc monitoring/grafana/dashboards/README -%doc monitoring/grafana/README.md %files prometheus-alerts %if 0%{?suse_version} diff --git a/debian/rules b/debian/rules index 1c215a76e5020..bdfab19493715 100755 --- a/debian/rules +++ b/debian/rules @@ -61,7 +61,7 @@ override_dh_auto_install: install -m 755 src/cephadm/cephadm $(DESTDIR)/usr/sbin/cephadm - install -m 644 -D monitoring/prometheus/alerts/ceph_default_alerts.yml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml + install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yaml $(DESTDIR)/etc/prometheus/ceph/ceph_default_alerts.yml # doc/changelog is a directory, which confuses dh_installchangelogs override_dh_installchangelogs: diff --git a/monitoring/CMakeLists.txt b/monitoring/CMakeLists.txt deleted file mode 100644 index 7d0155c5f13e9..0000000000000 --- a/monitoring/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(prometheus) diff --git a/monitoring/ceph-mixin/.gitignore b/monitoring/ceph-mixin/.gitignore new file mode 100644 index 0000000000000..22d0d82f8095e --- /dev/null +++ b/monitoring/ceph-mixin/.gitignore @@ -0,0 +1 @@ +vendor diff --git a/monitoring/ceph-mixin/.pylintrc b/monitoring/ceph-mixin/.pylintrc new file mode 120000 index 0000000000000..26d91e4cd8665 --- /dev/null +++ b/monitoring/ceph-mixin/.pylintrc @@ -0,0 +1 @@ +../../src/pybind/mgr/dashboard/.pylintrc \ No newline at end of file diff --git a/monitoring/ceph-mixin/CMakeLists.txt b/monitoring/ceph-mixin/CMakeLists.txt new file mode 100644 index 0000000000000..8621c26734250 --- /dev/null +++ b/monitoring/ceph-mixin/CMakeLists.txt @@ -0,0 +1,53 @@ +if(WITH_GRAFANA) + set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard" + CACHE PATH "Location for grafana dashboards") + file(GLOB CEPH_GRAFANA_DASHBOARDS "dashboards_out/*.json") + install(FILES + ${CEPH_GRAFANA_DASHBOARDS} + DESTINATION 
${CEPH_GRAFANA_DASHBOARDS_DIR}) + if(WITH_TESTS) + ExternalProject_Add(jsonnet-bundler + GIT_REPOSITORY "https://github.com/jsonnet-bundler/jsonnet-bundler.git" + GIT_TAG "v0.4.0" + GIT_SHALLOW TRUE + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/jsonnet-bundler + CONFIGURE_COMMAND "" + DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + BUILD_COMMAND make build + BUILD_IN_SOURCE 1 + INSTALL_COMMAND cp /_output/jb ) + + set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR}) + if(NOT CEPH_BUILD_VIRTUALENV) + include(AddCephTest) + set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR}) + add_tox_test(grafana-lint TOX_ENVS lint) + add_tox_test(jsonnet-lint TOX_ENVS jsonnet-lint) + add_tox_test(jsonnet-check TOX_ENVS jsonnet-check) + add_tox_test(alerts-check TOX_ENVS alerts-check) + add_tox_test(alerts-lint TOX_ENVS alerts-lint) + add_tox_test(promql-query-test TOX_ENVS promql-query-test) + endif() + + if(DEFINED PROMTOOL_EXECUTABLE) + set(promtool_executable_checked TRUE) + endif() + + find_program(PROMTOOL_EXECUTABLE promtool) + if(PROMTOOL_EXECUTABLE) + execute_process( + COMMAND ${PROMTOOL_EXECUTABLE} test rules /dev/null + RESULT_VARIABLE rc + OUTPUT_QUIET) + if(NOT rc) + add_ceph_test(run-promtool-unittests + ${PROMTOOL_EXECUTABLE} test rules ${CMAKE_SOURCE_DIR}/monitoring/ceph-mixin/tests_alerts/test_alerts.yml) + elseif(NOT promtool_executable_checked) + message(WARNING "'${PROMTOOL_EXECUTABLE} test rules' does not work, " + "please use a newer prometheus") + endif() + elseif(NOT promtool_executable_checked) + message(WARNING "run-promtool-unittests is skipped due to missing promtool") + endif() + endif() +endif() diff --git a/monitoring/ceph-mixin/Makefile b/monitoring/ceph-mixin/Makefile new file mode 100644 index 0000000000000..44575b77eb6a4 --- /dev/null +++ b/monitoring/ceph-mixin/Makefile @@ -0,0 +1,24 @@ +all: fmt generate lint test + +fmt: + ./lint-jsonnet.sh -i + +generate: dashboards_out + +vendor: jsonnetfile.lock.json + tox -ejsonnet-bundler-install + +dashboards_out: vendor $(JSONNETS_FILES) + tox -ejsonnet-fix + +lint: + tox -ejsonnet-lint + tox -ealerts-lint + +test: generate + tox -ejsonnet-check + tox -epromql-query-test + tox -ealerts-check +check: test + +.PHONY: all fmt generate lint test check diff --git a/monitoring/ceph-mixin/README.md b/monitoring/ceph-mixin/README.md new file mode 100644 index 0000000000000..164b73b881c66 --- /dev/null +++ b/monitoring/ceph-mixin/README.md @@ -0,0 +1,52 @@ +## Prometheus Monitoring Mixin for Ceph +A set of Grafana dashboards and Prometheus alerts for Ceph. + +All the Grafana dashboards are already generated in the `dashboards_out` +directory and the alerts in the `prometheus_alerts.yaml` file. + +You can use the Grafana dashboards and alerts with Jsonnet like any other +Prometheus mixin; a consumption sketch follows below. You can find more +resources about mixins in general on +[monitoring.mixins.dev](https://monitoring.mixins.dev/). + +### Grafana dashboards for Ceph +In `dashboards_out` you can find a collection of +[Grafana](https://grafana.com/grafana) dashboards for Ceph Monitoring. + +These dashboards are based on metrics collected +from [prometheus](https://prometheus.io/) scraping the [prometheus mgr +plugin](http://docs.ceph.com/en/latest/mgr/prometheus/) and the +[node_exporter](https://github.com/prometheus/node_exporter).
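+
+For example, here is a minimal consumption sketch in Jsonnet. It is
+illustrative only: it assumes you have vendored this mixin with
+`jb install` and that the vendor path `ceph-mixin` matches the name used
+in your `jsonnetfile.json`.
+
+```jsonnet
+// consume.jsonnet -- illustrative sketch, not shipped with the mixin.
+// mixin.libsonnet aggregates this package's dashboards and alerts; the
+// 'ceph-mixin' vendor path below is an assumption.
+local ceph = import 'ceph-mixin/mixin.libsonnet';
+
+{
+  // every generated dashboard, keyed by its output file name
+  dashboards: ceph.grafanaDashboards,
+  // the alert rules parsed from prometheus_alerts.yaml
+  alerts: ceph.prometheusAlerts,
+}
+```
+
+Rendering it with `jsonnet -J vendor consume.jsonnet` emits the
+dashboards and the alert rules as a single JSON document.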
+ +#### Requirements + +- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed on + your Grafana instance + +- [Pie Chart Panel](https://grafana.com/grafana/plugins/grafana-piechart-panel/) + installed on your Grafana instance + + +### Prometheus alerts +In `prometheus_alerts.yaml` you'll find a set of Prometheus +alert rules that should provide a decent baseline of default alerts for a +Ceph cluster. Put this file wherever your Prometheus configuration loads +rule files (i.e. a path that its `rule_files` stanza points to). + +#### SNMP +Ceph provides a MIB (CEPH-PROMETHEUS-ALERT-MIB.txt) to support sending Prometheus +alerts through to an SNMP management platform. The translation from Prometheus +alert to SNMP trap requires the Prometheus alert to contain an OID that maps to +a definition within the MIB. When making changes to the Prometheus alert rules +file, developers should include any necessary changes to the MIB. + +### Building from Jsonnet + +- Install [jsonnet](https://jsonnet.org/) + - Available as the `jsonnet` package on most distros and as + `golang-github-google-jsonnet` on Fedora +- Install [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler) + +To rebuild all the generated files, you can run `tox -egrafonnet-fix`. + +The jsonnet code located in this directory depends on some third-party +Jsonnet libraries. To update those libraries you can run `jb update` and then update +the generated files using `tox -egrafonnet-fix`. diff --git a/monitoring/ceph-mixin/alerts.libsonnet b/monitoring/ceph-mixin/alerts.libsonnet new file mode 100644 index 0000000000000..8671637de5d5d --- /dev/null +++ b/monitoring/ceph-mixin/alerts.libsonnet @@ -0,0 +1,3 @@ +{ + prometheusAlerts+:: std.parseYaml(importstr 'prometheus_alerts.yaml'), +} diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet new file mode 100644 index 0000000000000..0967ef424bce6 --- /dev/null +++ b/monitoring/ceph-mixin/config.libsonnet @@ -0,0 +1 @@ +{} diff --git a/monitoring/ceph-mixin/dashboards.jsonnet b/monitoring/ceph-mixin/dashboards.jsonnet new file mode 100644 index 0000000000000..9d913ed3f18c4 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import 'mixin.libsonnet').grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/monitoring/ceph-mixin/dashboards/cephfs.libsonnet b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet new file mode 100644 index 0000000000000..3dabc1608ad35 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/cephfs.libsonnet @@ -0,0 +1,103 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'cephfs-overview.json': + local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) = + u.graphPanelSchema({}, + title, + '', + 'null', + false, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr, 1, 'time_series', legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'MDS Performance', + '', + 'tbO9LAiZz', + 'now-1h', + '15s', + 16, + [], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations 
& Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('mds_servers', + '$datasource', + 'label_values(ceph_mds_inodes, ceph_daemon)', + 1, + true, + 1, + 'MDS Server', + '') + ) + .addPanels([ + u.addRowSchema(false, true, 'MDS Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + CephfsOverviewGraphPanel( + 'MDS Workload - $mds_servers', + 'none', + 'Reads(-) / Writes (+)', + 'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*"}[1m]))', + 'Read Ops', + 0, + 1, + 12, + 9 + ) + .addTarget(u.addTargetSchema( + 'sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*"}[1m]))', + 1, + 'time_series', + 'Write Ops' + )) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + CephfsOverviewGraphPanel( + 'Client Request Load - $mds_servers', + 'none', + 'Client Requests', + 'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*"}', + '{{ceph_daemon}}', + 12, + 1, + 12, + 9 + ), + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/dashboards.libsonnet b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 0000000000000..72ca483248f8e --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,6 @@ +(import 'cephfs.libsonnet') + +(import 'host.libsonnet') + +(import 'osd.libsonnet') + +(import 'pool.libsonnet') + +(import 'rbd.libsonnet') + +(import 'rgw.libsonnet') diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet new file mode 100644 index 0000000000000..b2ee5c94f0043 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/host.libsonnet @@ -0,0 +1,562 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'hosts-overview.json': + local HostsOverviewSingleStatPanel(format, + title, + description, + valueName, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget( + u.addTargetSchema(expr, 1, targetFormat, '') + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) = + u.graphPanelSchema( + {}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource' + ) + .addTargets( + [u.addTargetSchema( + expr, 1, 'time_series', legendFormat + )] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'Host Overview', + '', + 'y0KGL0iZz', + 'now-1h', + '10s', + 16, + [], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + 
g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('osd_hosts', + '$datasource', + 'label_values(ceph_disk_occupation, exported_instance)', + 1, + true, + 1, + null, + '([^.]*).*') + ) + .addTemplate( + u.addTemplateSchema('mon_hosts', + '$datasource', + 'label_values(ceph_mon_metadata, ceph_daemon)', + 1, + true, + 1, + null, + 'mon.(.*)') + ) + .addTemplate( + u.addTemplateSchema('mds_hosts', + '$datasource', + 'label_values(ceph_mds_inodes, ceph_daemon)', + 1, + true, + 1, + null, + 'mds.(.*)') + ) + .addTemplate( + u.addTemplateSchema('rgw_hosts', + '$datasource', + 'label_values(ceph_rgw_metadata, ceph_daemon)', + 1, + true, + 1, + null, + 'rgw.(.*)') + ) + .addPanels([ + HostsOverviewSingleStatPanel( + 'none', + 'OSD Hosts', + '', + 'current', + 'count(sum by (hostname) (ceph_osd_metadata))', + 'time_series', + 0, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percentunit', + 'AVG CPU Busy', + 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', + 'current', + 'avg(\n 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )', + 'time_series', + 4, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percentunit', + 'AVG RAM Utilization', + 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', + 'current', + 'avg (((node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})- (\n (node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +\n (node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})\n )) /\n (node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} ))', + 'time_series', + 8, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'none', + 'Physical IOPS', + 'IOPS Load at the device as reported by the OS on all OSD hosts', + 'current', + 'sum ((irate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[5m]) ) + \n(irate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[5m])))', + 'time_series', + 12, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'percent', + 'AVG Disk Utilization', + 'Average Disk utilization for all OSD data devices (i.e. 
excludes journal/WAL)', + 'current', + 'avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), "instance", "$1", "instance", "([^.:]*).*"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^.:]*).*")\n)', + 'time_series', + 16, + 0, + 4, + 5 + ), + HostsOverviewSingleStatPanel( + 'bytes', + 'Network Load', + 'Total send/receive network load across all hosts in the ceph cluster', + 'current', + ||| + sum ( + ( + irate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or + irate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) + + sum ( + ( + irate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or + irate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") + ) + ||| + , + 'time_series', + 20, + 0, + 4, + 5 + ), + HostsOverviewGraphPanel( + 'CPU Busy - Top 10 Hosts', + 'Show the top 10 busiest hosts by cpu', + 'percent', + 'topk(10,100 * ( 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )\n)', + '{{instance}}', + 0, + 5, + 12, + 9 + ), + HostsOverviewGraphPanel( + 'Network Load - Top 10 Hosts', 'Top 10 hosts by network load', 'Bps', ||| + topk(10, (sum by(instance) ( + ( + irate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or + irate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) + ) + + ( + irate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or + irate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) + ) unless on (device, instance) + label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")) + )) + ||| + , '{{instance}}', 12, 5, 12, 9 + ), + ]), + 'host-details.json': + local HostDetailsSingleStatPanel(format, + title, + description, + valueName, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget(u.addTargetSchema(expr, + 1, + targetFormat, + '')) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local HostDetailsGraphPanel(alias, + title, + description, + nullPointMode, + formatY1, + labelY1, + expr, + legendFormat, + x, + y, + w, + h) = + u.graphPanelSchema(alias, + title, + description, + nullPointMode, + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr, + 1, + 'time_series', + legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'Host Details', + '', + 'rtOg0AiWz', + 'now-1h', + '10s', + 16, + 
['overview'], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + u.addAnnotationSchema( + 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('ceph_hosts', '$datasource', 'label_values(node_scrape_collector_success, instance) ', 1, false, 3, 'Hostname', '([^.:]*).*') + ) + .addPanels([ + u.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + HostDetailsSingleStatPanel( + 'none', + 'OSDs', + '', + 'current', + "count(sum by (ceph_daemon) (ceph_osd_metadata{hostname='$ceph_hosts'}))", + 'time_series', + 0, + 1, + 3, + 5 + ), + HostDetailsGraphPanel( + { + interrupt: '#447EBC', + steal: '#6D1F62', + system: '#890F02', + user: '#3F6833', + wait: '#C15C17', + }, 'CPU Utilization', "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", 'null', 'percent', '% Utilization', 'sum by (mode) (\n irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]))\n) * 100', '{{mode}}', 3, 1, 6, 10 + ), + HostDetailsGraphPanel( + { + Available: '#508642', + Free: '#508642', + Total: '#bf1b00', + Used: '#bf1b00', + total: '#bf1b00', + used: '#0a50a1', + }, + 'RAM Usage', + '', + 'null', + 'bytes', + 'RAM used', + 'node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', + 'Free', + 9, + 1, + 6, + 10 + ) + .addTargets( + [ + u.addTargetSchema('node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', 1, 'time_series', 'total'), + u.addTargetSchema('(node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n(node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n(node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) \n', 1, 'time_series', 'buffers/cache'), + u.addTargetSchema('(node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})- (\n (node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n (node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or 
node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})\n )\n \n', 1, 'time_series', 'used'), + ] + ) + .addSeriesOverride( + { + alias: 'total', + color: '#bf1b00', + fill: 0, + linewidth: 2, + stack: false, + } + ), + HostDetailsGraphPanel( + {}, + 'Network Load', + "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + 'null', + 'decbytes', + 'Send (-) / Receive (+)', + 'sum by (device) (\n irate(node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', + '{{device}}.rx', + 15, + 1, + 6, + 10 + ) + .addTargets( + [ + u.addTargetSchema('sum by (device) (\n irate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', 1, 'time_series', '{{device}}.tx'), + ] + ) + .addSeriesOverride( + { alias: '/.*tx/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + 'Network drop rate', + '', + 'null', + 'pps', + 'Send (-) / Receive (+)', + 'irate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', + '{{device}}.rx', + 21, + 1, + 3, + 5 + ) + .addTargets( + [ + u.addTargetSchema( + 'irate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', 1, 'time_series', '{{device}}.tx' + ), + ] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + HostDetailsSingleStatPanel( + 'bytes', + 'Raw Capacity', + 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', + 'current', + 'sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"})', + 'time_series', + 0, + 6, + 3, + 5 + ), + HostDetailsGraphPanel( + {}, + 'Network error rate', + '', + 'null', + 'pps', + 'Send (-) / Receive (+)', + 'irate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', + '{{device}}.rx', + 21, + 6, + 3, + 5 + ) + .addTargets( + [u.addTargetSchema( + 'irate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', 1, 'time_series', '{{device}}.tx' + )] + ) + .addSeriesOverride( + { + alias: '/.*tx/', + transform: 'negative-Y', + } + ), + u.addRowSchema(false, + true, + 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } }, + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk IOPS', + "For any OSD devices on the host, this chart shows the iops per physical device. 
Each device is shown by its name and corresponding OSD id value", + 'connected', + 'ops', + 'Read (-) / Write (+)', + 'label_replace(\n (\n irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )', + '{{device}}({{ceph_daemon}}) writes', + 0, + 12, + 11, + 9 + ) + .addTargets( + [ + u.addTargetSchema( + 'label_replace(\n (irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )', + 1, + 'time_series', + '{{device}}({{ceph_daemon}}) reads' + ), + ] + ) + .addSeriesOverride( + { alias: '/.*reads/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + '$ceph_hosts Throughput by Disk', + 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name and corresponding OSD id', + 'connected', + 'Bps', + 'Read (-) / Write (+)', + 'label_replace((irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{device}}({{ceph_daemon}}) write', + 12, + 12, + 11, + 9 + ) + .addTargets( + [u.addTargetSchema( + 'label_replace((irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 1, + 'time_series', + '{{device}}({{ceph_daemon}}) read' + )] + ) + .addSeriesOverride( + { alias: '/.*read/', transform: 'negative-Y' } + ), + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk Latency', + "For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with its corresponding OSD id", + 'null as zero', + 's', + '', + 'max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), "instance", "$1", "instance", "([^:.]*).*")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{device}}({{ceph_daemon}})', + 0, + 21, + 11, + 9 + ), + HostDetailsGraphPanel( + {}, + '$ceph_hosts Disk utilization', + 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', + 'connected', + 'percent', + '%Util', + 'label_replace(((irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{device}}({{ceph_daemon}})', + 12, + 21, + 11, + 9 + ), + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet new file mode 100644 index 0000000000000..8b425fb395a37 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet @@ -0,0 +1,533 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'osds-overview.json': + local OsdOverviewStyle(alias, pattern, type, unit) = + u.addStyle(alias, null, [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []); + local OsdOverviewGraphPanel(alias, + title, + description, + formatY1, + labelY1, + min, + expr, + legendFormat1, + x, + y, + w, + h) = + u.graphPanelSchema(alias, + title, + description, + 'null', + false, + formatY1, + 'short', + labelY1, + null, + min, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr, 1, 'time_series', legendFormat1)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + local OsdOverviewPieChartPanel(alias, description, title) = + u.addPieChartSchema(alias, + '$datasource', + description, + 'Under graph', + 'pie', + title, + 'current'); + local OsdOverviewSingleStatPanel(colors, + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema( + colors, + '$datasource', + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds + ) + .addTarget( + u.addTargetSchema(expr, 1, targetFormat, '') + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'OSD Overview', + '', + 'lo02I1Aiz', + 'now-1h', + '10s', + 16, + [], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', 
'6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addPanels([ + OsdOverviewGraphPanel( + { '@95%ile': '#e0752d' }, + 'OSD Read Latencies', + '', + 'ms', + null, + '0', + 'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)', + 'AVG read', + 0, + 0, + 8, + 8 + ) + .addTargets( + [ + u.addTargetSchema( + 'max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)', + 1, + 'time_series', + 'MAX read' + ), + u.addTargetSchema( + 'quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile' + ), + ], + ), + u.addTableSchema( + '$datasource', + "This table shows the OSDs that are delivering the 10 highest read latencies within the cluster", + { col: 2, desc: true }, + [ + OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'), + OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'), + OsdOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest READ Latencies', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n', 1, 'table', '' + ) + ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } }, + OsdOverviewGraphPanel( + { + '@95%ile write': '#e0752d', + }, + 'OSD Write Latencies', + '', + 'ms', + null, + '0', + 'avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)', + 'AVG write', + 12, + 0, + 8, + 8 + ) + .addTargets( + [ + u.addTargetSchema( + 'max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)', + 1, + 'time_series', + 'MAX write' + ), + u.addTargetSchema( + 'quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile write' + ), + ], + ), + u.addTableSchema( + '$datasource', + "This table shows the OSDs that are delivering the 10 highest write latencies within the cluster", + { col: 2, desc: true }, + [ + OsdOverviewStyle( + 'OSD ID', 'ceph_daemon', 'string', 'short' + ), + OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'), + OsdOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest WRITE Latencies', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } }, + OsdOverviewPieChartPanel( + {}, '', 'OSD Types Summary' + ) + .addTarget( + u.addTargetSchema('count by (device_class) (ceph_osd_metadata)', 1, 'time_series', '{{device_class}}') + ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } }, + OsdOverviewPieChartPanel( + { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD 
Objectstore Types' + ) + .addTarget( + u.addTargetSchema( + 'count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'bluestore' + ) + ) + .addTarget( + u.addTargetSchema( + 'absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 1, 'time_series', 'filestore' + ) + ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } }, + OsdOverviewPieChartPanel( + {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary' + ) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes < 1099511627776)', 1, 'time_series', '<1TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', 1, 'time_series', '<2TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', 1, 'time_series', '<3TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', 1, 'time_series', '<4TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', 1, 'time_series', '<6TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', 1, 'time_series', '<8TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', 1, 'time_series', '<10TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', 1, 'time_series', '<12TB' + )) + .addTarget(u.addTargetSchema( + 'count(ceph_osd_stat_bytes >= 13194139533312)', 1, 'time_series', '<12TB+' + )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } }, + g.graphPanel.new(bars=true, + datasource='$datasource', + title='Distribution of PGs per OSD', + x_axis_buckets=20, + x_axis_mode='histogram', + x_axis_values=['total'], + formatY1='short', + formatY2='short', + labelY1='# of OSDs', + min='0', + nullPointMode='null') + .addTarget(u.addTargetSchema( + 'ceph_osd_numpg\n', 1, 'time_series', 'PGs per OSD' + )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } }, + OsdOverviewSingleStatPanel( + ['#d44a3a', '#299c46'], + 'percentunit', + 'OSD onode Hits Ratio', + 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster', + 'current', + true, + 1, + true, + false, + '.75', + 'sum(ceph_bluestore_onode_hits)/(sum(ceph_bluestore_onode_hits) + sum(ceph_bluestore_onode_misses))', + 'time_series', + 20, + 8, + 4, + 8 + ), + u.addRowSchema(false, + true, + 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } }, + OsdOverviewGraphPanel( + {}, + 'Read/Write Profile', + 'Show the read/write workload profile over time', + 'short', + null, + null, + 'round(sum(irate(ceph_pool_rd[30s])))', + 'Reads', + 0, + 17, + 24, + 8 + ) + .addTargets([u.addTargetSchema( + 'round(sum(irate(ceph_pool_wr[30s])))', 1, 'time_series', 'Writes' + )]), + ]), + 'osd-device-details.json': + local OsdDeviceDetailsPanel(title, + description, + formatY1, + labelY1, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + u.graphPanelSchema({}, + title, + description, + 'null', + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [ + u.addTargetSchema(expr1, + 1, + 'time_series', + legendFormat1), + u.addTargetSchema(expr2, 1, 'time_series', legendFormat2), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'OSD device details', + '', + 'CrAHE0iZz', + 'now-3h', + '', + 16, + [], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', 
'30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('osd', + '$datasource', + 'label_values(ceph_osd_metadata,ceph_daemon)', + 1, + false, + 1, + 'OSD', + '(.*)') + ) + .addPanels([ + u.addRowSchema( + false, true, 'OSD Performance' + ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + '$osd Latency', + '', + 's', + 'Read (-) / Write (+)', + 'irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])', + 'irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])', + 'read', + 'write', + 0, + 1, + 6, + 9 + ) + .addSeriesOverride( + { + alias: 'read', + transform: 'negative-Y', + } + ), + OsdDeviceDetailsPanel( + '$osd R/W IOPS', + '', + 'short', + 'Read (-) / Write (+)', + 'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])', + 'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])', + 'Reads', + 'Writes', + 6, + 1, + 6, + 9 + ) + .addSeriesOverride( + { alias: 'Reads', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + '$osd R/W Bytes', + '', + 'bytes', + 'Read (-) / Write (+)', + 'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])', + 'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])', + 'Read Bytes', + 'Write Bytes', + 12, + 1, + 6, + 9 + ) + .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }), + u.addRowSchema( + false, true, 'Physical Device Performance' + ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } }, + OsdDeviceDetailsPanel( + 'Physical Device Latency for $osd', + '', + 's', + 'Read (-) / Write (+)', + '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))', + '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))', + '{{instance}}/{{device}} Reads', + '{{instance}}/{{device}} Writes', + 0, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W IOPS for $osd', + '', + 'short', + 'Read (-) / Write (+)', + 'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) 
label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{device}} on {{instance}} Writes', + '{{device}} on {{instance}} Reads', + 6, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + OsdDeviceDetailsPanel( + 'Physical Device R/W Bytes for $osd', + '', + 'Bps', + 'Read (-) / Write (+)', + 'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + '{{instance}} {{device}} Reads', + '{{instance}} {{device}} Writes', + 12, + 11, + 6, + 9 + ) + .addSeriesOverride( + { alias: '/.*Reads/', transform: 'negative-Y' } + ), + u.graphPanelSchema( + {}, + 'Physical Device Util% for $osd', + '', + 'null', + false, + 'percentunit', + 'short', + null, + null, + null, + 1, + '$datasource' + ) + .addTarget(u.addTargetSchema( + 'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', + 1, + 'time_series', + '{{device}} on {{instance}}' + )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } }, + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/pool.libsonnet b/monitoring/ceph-mixin/dashboards/pool.libsonnet new file mode 100644 index 0000000000000..527c9124ba2c5 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/pool.libsonnet @@ -0,0 +1,570 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'pool-overview.json': + local PoolOverviewSingleStatPanel(format, + title, + description, + valueName, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + false, + 100, + false, + false, + '') + .addTarget(u.addTargetSchema(expr, 1, targetFormat, '')) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local PoolOverviewStyle(alias, + pattern, + type, + unit, + colorMode, + thresholds, + valueMaps) = + u.addStyle(alias, + colorMode, + [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + thresholds, + type, + unit, + valueMaps); + + local PoolOverviewGraphPanel(title, + description, + formatY1, + labelY1, + expr, + targetFormat, + legendFormat, + x, + y, + w, + h) = + u.graphPanelSchema({}, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr, + 1, + 'time_series', + legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'Ceph Pools Overview', + '', + 'z99hzWtmk', + 'now-1h', + '15s', + 22, + [], + '', + { refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], 
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'] } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'Dashboard1', + label='Data Source') + ) + .addTemplate( + g.template.custom(label='TopK', + name='topk', + current='15', + query='15') + ) + .addPanels([ + PoolOverviewSingleStatPanel( + 'none', + 'Pools', + '', + 'avg', + 'count(ceph_pool_metadata)', + 'table', + 0, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'none', + 'Pools with Compression', + 'Count of the pools that have compression enabled', + 'current', + 'count(ceph_pool_metadata{compression_mode!="none"})', + '', + 3, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Total Raw Capacity', + 'Total raw capacity available to the cluster', + 'current', + 'sum(ceph_osd_stat_bytes)', + '', + 6, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Raw Capacity Consumed', + 'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)', + 'current', + 'sum(ceph_pool_bytes_used)', + '', + 9, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Logical Stored ', + 'Total of client data stored in the cluster', + 'current', + 'sum(ceph_pool_stored)', + '', + 12, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'bytes', + 'Compression Savings', + 'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression', + 'current', + 'sum(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used)', + '', + 15, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'percent', + 'Compression Eligibility', + 'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data\n', + 'current', + '(sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_stored_raw and ceph_pool_compress_under_bytes > 0)) * 100', + 'table', + 18, + 0, + 3, + 3 + ), + PoolOverviewSingleStatPanel( + 'none', + 'Compression Factor', + 'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. 
It does not account for data written that was ineligible for compression (too small, or compression yield too low)', + 'current', + 'sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_compress_bytes_used > 0)', + '', + 21, + 0, + 3, + 3 + ), + u.addTableSchema( + '$datasource', + '', + { col: 5, desc: true }, + [ + PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []), + PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []), + PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []), + PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70', '85'], []), + PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []), + PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []), + PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []), + PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []), + PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []), + PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []), + PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []), + PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []), + PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []), + PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []), + PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{ text: 'ON', value: '1' }]), + ], + 'Pool Overview', + 'table' + ) + .addTargets( + [ + u.addTargetSchema( + '(ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) (((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5)', + 1, + 'table', + 'A' + ), + u.addTargetSchema( + 'ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata', + 1, + 'table', + 'B' + ), + u.addTargetSchema( + '((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100', + 1, + 'table', + 'C' + ), + u.addTargetSchema( + '(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)', + 1, + 'table', + 'D' + ), + u.addTargetSchema( + '(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used > 0)', + 1, + 'table', + 'E' + ), + u.addTargetSchema( + 'delta(ceph_pool_stored[5d])', 1, 'table', 'F' + ), + u.addTargetSchema( + 'rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])', + 1, + 'table', + 'G' + ), + u.addTargetSchema( + 'rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])', + 1, + 'table', + 'H' + ), + u.addTargetSchema( + 'ceph_pool_metadata', 1, 'table', 'I' + ), + u.addTargetSchema( + 'ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata', + 1, + 'table', + 'J' + ), + u.addTargetSchema( + 'ceph_pool_metadata{compression_mode!="none"}', 1, 'table', 'K' + ), + u.addTargetSchema('', '', '', 'L'), + ] + ) + { gridPos: { x: 0, y: 3, w: 24, h: 6 } }, + PoolOverviewGraphPanel( + 'Top $topk Client IOPS by Pool', + 'This chart shows the sum of read and write IOPS from all clients by pool', + 'short', + 'IOPS', + 'topk($topk,round((rate(ceph_pool_rd[30s]) + 
rate(ceph_pool_wr[30s])),1) * on(pool_id) group_left(instance,name) ceph_pool_metadata) ', + 'time_series', + '{{name}} ', + 0, + 9, + 12, + 8 + ) + .addTarget( + u.addTargetSchema( + 'topk($topk,rate(ceph_pool_wr[30s]) + on(pool_id) group_left(instance,name) ceph_pool_metadata) ', + 1, + 'time_series', + '{{name}} - write' + ) + ), + PoolOverviewGraphPanel( + 'Top $topk Client Bandwidth by Pool', + 'The chart shows the sum of read and write bytes from all clients, by pool', + 'Bps', + 'Throughput', + 'topk($topk,(rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])) * on(pool_id) group_left(instance,name) ceph_pool_metadata)', + 'time_series', + '{{name}}', + 12, + 9, + 12, + 8 + ), + PoolOverviewGraphPanel( + 'Pool Capacity Usage (RAW)', + 'Historical view of capacity usage, to help identify growth and trends in pool consumption', + 'bytes', + 'Capacity Used', + 'ceph_pool_bytes_used * on(pool_id) group_right ceph_pool_metadata', + '', + '{{name}}', + 0, + 17, + 24, + 7 + ), + ]), + 'pool-detail.json': + local PoolDetailSingleStatPanel(format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds, + expr, + targetFormat, + x, + y, + w, + h) = + u.addSingleStatSchema(['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + '$datasource', + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparkLineShow, + thresholds) + .addTarget(u.addTargetSchema(expr, 1, targetFormat, '')) + { gridPos: { x: x, y: y, w: w, h: h } }; + + local PoolDetailGraphPanel(alias, + title, + description, + formatY1, + labelY1, + expr, + targetFormat, + legendFormat, + x, + y, + w, + h) = + u.graphPanelSchema(alias, + title, + description, + 'null as zero', + false, + formatY1, + 'short', + labelY1, + null, + null, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr, 1, 'time_series', legendFormat)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'Ceph Pool Details', + '', + '-xyV8KCiz', + 'now-1h', + '15s', + 22, + [], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='panel', id='singlestat', name='Singlestat', version='5.0.0' + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'Prometheus admin.virt1.home.fajerski.name:9090', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('pool_name', + '$datasource', + 'label_values(ceph_pool_metadata,name)', + 1, + false, + 1, + 'Pool Name', + '') + ) + .addPanels([ + PoolDetailSingleStatPanel( + 'percentunit', + 'Capacity used', + '', + 'current', + true, + 1, + true, + true, + '.7,.8', + '(ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', + 'time_series', + 0, + 0, + 7, + 7 + ), + PoolDetailSingleStatPanel( + 's', + 'Time till full', + 'Time till pool is full assuming the average fill rate of the last 6 hours', + false, + 100, + false, + false, + '', + 'current', + '(ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) * on(pool_id) 
group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} > 0', + 'time_series', + 7, + 0, + 5, + 7 + ), + PoolDetailGraphPanel( + { + read_op_per_sec: + '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Object Ingress/Egress', + '', + 'ops', + 'Objects out(-) / in(+) ', + 'deriv(ceph_pool_objects[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', + 'time_series', + 'Objects per second', + 12, + 0, + 12, + 7 + ), + PoolDetailGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, '$pool_name Client IOPS', '', 'iops', 'Read (-) / Write (+)', 'irate(ceph_pool_rd[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', 'time_series', 'reads', 0, 7, 12, 7 + ) + .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) + .addTarget( + u.addTargetSchema( + 'irate(ceph_pool_wr[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', 1, 'time_series', 'writes' + ) + ), + PoolDetailGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Client Throughput', + '', + 'Bps', + 'Read (-) / Write (+)', + 'irate(ceph_pool_rd_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', + 'time_series', + 'reads', + 12, + 7, + 12, + 7 + ) + .addSeriesOverride({ alias: 'reads', transform: 'negative-Y' }) + .addTarget( + u.addTargetSchema( + 'irate(ceph_pool_wr_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', + 1, + 'time_series', + 'writes' + ) + ), + PoolDetailGraphPanel( + { + read_op_per_sec: '#3F6833', + write_op_per_sec: '#E5AC0E', + }, + '$pool_name Objects', + '', + 'short', + 'Objects', + 'ceph_pool_objects * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}', + 'time_series', + 'Number of Objects', + 0, + 14, + 12, + 7 + ), + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/monitoring/ceph-mixin/dashboards/rbd.libsonnet new file mode 100644 index 0000000000000..d464f889f54db --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/rbd.libsonnet @@ -0,0 +1,309 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'rbd-details.json': + local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) = + u.graphPanelSchema({}, + title, + '', + 'null as zero', + false, + formatY1, + formatY1, + null, + null, + 0, + 1, + '$Datasource') + .addTargets( + [ + u.addTargetSchema(expr1, + 1, + 'time_series', + '{{pool}} Write'), + u.addTargetSchema(expr2, 1, 'time_series', '{{pool}} Read'), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RBD Details', + 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)', + 'YhCYGcuZz', + 'now-1h', + false, + 16, + [], + '', + { + refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('Datasource', 'prometheus', 'default', label=null) + ) + .addTemplate( + u.addTemplateSchema('Pool', + '$Datasource', + 
'label_values(pool)', + 1, + false, + 0, + '', + '') + ) + .addTemplate( + u.addTemplateSchema('Image', + '$Datasource', + 'label_values(image)', + 1, + false, + 0, + '', + '') + ) + .addPanels([ + RbdDetailsPanel( + 'IOPS', + 'iops', + 'irate(ceph_rbd_write_ops{pool="$Pool", image="$Image"}[30s])', + 'irate(ceph_rbd_read_ops{pool="$Pool", image="$Image"}[30s])', + 0, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Throughput', + 'Bps', + 'irate(ceph_rbd_write_bytes{pool="$Pool", image="$Image"}[30s])', + 'irate(ceph_rbd_read_bytes{pool="$Pool", image="$Image"}[30s])', + 8, + 0, + 8, + 9 + ), + RbdDetailsPanel( + 'Average Latency', + 'ns', + 'irate(ceph_rbd_write_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_write_latency_count{pool="$Pool", image="$Image"}[30s])', + 'irate(ceph_rbd_read_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_read_latency_count{pool="$Pool", image="$Image"}[30s])', + 16, + 0, + 8, + 9 + ), + ]), + 'rbd-overview.json': + local RgwOverviewStyle(alias, pattern, type, unit) = + u.addStyle(alias, + null, + ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'], + 'YYYY-MM-DD HH:mm:ss', + 2, + 1, + pattern, + [], + type, + unit, + []); + local RbdOverviewPanel(title, + formatY1, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + u.graphPanelSchema({}, + title, + '', + 'null', + false, + formatY1, + 'short', + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [ + u.addTargetSchema(expr1, + 1, + 'time_series', + legendFormat1), + u.addTargetSchema(expr2, + 1, + 'time_series', + legendFormat2), + ] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RBD Overview', + '', + '41FrpeUiz', + 'now-1h', + '30s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.4.2' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addRequired( + type='datasource', id='prometheus', name='Prometheus', version='5.0.0' + ) + .addRequired( + type='panel', id='table', name='Table', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addPanels([ + RbdOverviewPanel( + 'IOPS', + 'short', + 'round(sum(irate(ceph_rbd_write_ops[30s])))', + 'round(sum(irate(ceph_rbd_read_ops[30s])))', + 'Writes', + 'Reads', + 0, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Throughput', + 'Bps', + 'round(sum(irate(ceph_rbd_write_bytes[30s])))', + 'round(sum(irate(ceph_rbd_read_bytes[30s])))', + 'Write', + 'Read', + 8, + 0, + 8, + 7 + ), + RbdOverviewPanel( + 'Average Latency', + 'ns', + 'round(sum(irate(ceph_rbd_write_latency_sum[30s])) / sum(irate(ceph_rbd_write_latency_count[30s])))', + 'round(sum(irate(ceph_rbd_read_latency_sum[30s])) / sum(irate(ceph_rbd_read_latency_count[30s])))', + 'Write', + 'Read', + 16, + 0, + 8, + 7 + ), + u.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], 
+ 'Highest IOPS', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10, (sort((irate(ceph_rbd_write_ops[30s]) + on (image, pool, namespace) irate(ceph_rbd_read_ops[30s])))))', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 0, y: 7, w: 8, h: 7 } }, + u.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest Throughput', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10, sort(sum(irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s])) by (pool, image, namespace)))', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 8, y: 7, w: 8, h: 7 } }, + u.addTableSchema( + '$datasource', + '', + { col: 3, desc: true }, + [ + RgwOverviewStyle('Pool', 'pool', 'string', 'short'), + RgwOverviewStyle('Image', 'image', 'string', 'short'), + RgwOverviewStyle('Latency', 'Value', 'number', 'ns'), + RgwOverviewStyle('', '/.*/', 'hidden', 'short'), + ], + 'Highest Latency', + 'table' + ) + .addTarget( + u.addTargetSchema( + 'topk(10,\n  sum(\n    irate(ceph_rbd_write_latency_sum[30s]) / clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) +\n    irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1)\n  ) by (pool, image, namespace)\n)', + 1, + 'table', + '' + ) + ) + { gridPos: { x: 16, y: 7, w: 8, h: 7 } }, + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/monitoring/ceph-mixin/dashboards/rgw.libsonnet new file mode 100644 index 0000000000000..e0ad25fb59b24 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -0,0 +1,643 @@ +local g = import 'grafonnet/grafana.libsonnet'; +local u = import 'utils.libsonnet'; + +{ + grafanaDashboards+:: { + 'radosgw-sync-overview.json': + local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = + u.graphPanelSchema({}, + title, + '', + 'null as zero', + true, + formatY1, + 'short', + labelY1, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric, + 1, + 'time_series', + '{{source_zone}}')] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RGW Sync Overview', + '', + 'rgw-sync-overview', + 'now-1h', + '15s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + u.addTemplateSchema('rgw_servers', '$datasource', 'prometheus', 1, true, 1, '', '') + ) + .addTemplate( + g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') + ) + .addPanels([ + RgwSyncOverviewPanel( + 'Replication (throughput) from Source Zone', + 'Bps', + null, + 'ceph_data_sync_from_zone_fetch_bytes_sum', + 0, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Replication (objects) from Source Zone', + 'short', + 'Objects/s', + 'ceph_data_sync_from_zone_fetch_bytes_count', + 8, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Polling Request Latency from 
Source Zone', + 'ms', + null, + 'ceph_data_sync_from_zone_poll_latency_sum', + 16, + 0, + 8, + 7 + ), + RgwSyncOverviewPanel( + 'Unsuccessful Object Replications from Source Zone', + 'short', + 'Count/s', + 'ceph_data_sync_from_zone_fetch_errors', + 0, + 7, + 8, + 7 + ), + ]), + 'radosgw-overview.json': + local RgwOverviewPanel( + title, + description, + formatY1, + formatY2, + expr1, + legendFormat1, + x, + y, + w, + h, + datasource='$datasource', + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false + ) = + u.graphPanelSchema( + {}, + title, + description, + 'null', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + datasource, + legend_alignAsTable, + legend_avg, + legend_min, + legend_max, + legend_current, + legend_values + ) + .addTargets( + [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RGW Overview', + '', + 'WAkugZpiz', + 'now-1h', + '15s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + u.addTemplateSchema( + 'rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata, ceph_daemon)', + 1, + true, + 1, + '', + '' + ) + ) + .addTemplate( + u.addTemplateSchema( + 'code', + '$datasource', + 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', + 1, + true, + 1, + 'HTTP Code', + '' + ) + ) + .addTemplate( + u.addTemplateSchema( + 'ingress_service', + '$datasource', + 'label_values(haproxy_server_status, instance)', + 1, + true, + 1, + 'Ingress Service', + '' + ) + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addPanels([ + u.addRowSchema(false, + true, + 'RGW Overview - All Gateways') + + { + gridPos: { x: 0, y: 0, w: 24, h: 1 }, + }, + RgwOverviewPanel( + 'Average GET/PUT Latencies', + '', + 's', + 'short', + 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', + 'GET AVG', + 0, + 1, + 8, + 7 + ).addTargets( + [ + u.addTargetSchema( + 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', + 1, + 'time_series', + 'PUT AVG' + ), + ] + ), + RgwOverviewPanel( + 'Total Requests/sec by RGW Instance', + '', + 'none', + 'short', + 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', + '{{rgw_host}}', + 8, + 1, + 7, + 7 + ), + RgwOverviewPanel( + 'GET Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', + 's', + 'short', + 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", 
"rgw.(.*)")', + '{{rgw_host}}', + 15, + 1, + 6, + 7 + ), + RgwOverviewPanel( + 'Bandwidth Consumed by Type', + 'Total bytes transferred in/out of all radosgw instances within the cluster', + 'bytes', + 'short', + 'sum(rate(ceph_rgw_get_b[30s]))', + 'GETs', + 0, + 8, + 8, + 6 + ).addTargets( + [u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))', + 1, + 'time_series', + 'PUTs')] + ), + RgwOverviewPanel( + 'Bandwidth by RGW Instance', + 'Total bytes transferred in/out through get/put operations, by radosgw instance', + 'bytes', + 'short', + 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', + '{{rgw_host}}', + 8, + 8, + 7, + 6 + ), + RgwOverviewPanel( + 'PUT Latencies by RGW Instance', + 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', + 's', + 'short', + 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', + '{{rgw_host}}', + 15, + 8, + 6, + 6 + ), + u.addRowSchema( + false, true, 'RGW Overview - HAProxy Metrics' + ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } }, + RgwOverviewPanel( + 'Total responses by HTTP code', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)', + 'Frontend {{ code }}', + 0, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')] + ) + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + { alias: '/.*1.*/' }, + { alias: '/.*2.*/' }, + { alias: '/.*3.*/' }, + { alias: '/.*4.*/' }, + { alias: '/.*5.*/' }, + { alias: '/.*other.*/' }, + ]), + RgwOverviewPanel( + 'Total requests / responses', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', + 'Requests', + 5, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'), + u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'), + u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'), + u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'), + u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'), + u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'), + ] + ) + .addSeriesOverride([ + { + alias: 
'/.*Response.*/', + transform: 'negative-Y', + }, + { + alias: '/.*Backend.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Total number of connections', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', + 'Front', + 10, + 12, + 5, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'), + u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'), + ] + ) + .addSeriesOverride([ + { + alias: '/.*Back.*/', + transform: 'negative-Y', + }, + ]), + RgwOverviewPanel( + 'Current total of incoming / outgoing bytes', + '', + 'short', + 'short', + 'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', + 'IN Front', + 15, + 12, + 6, + 12, + '$datasource', + true, + true, + true, + true, + true, + true + ) + .addTargets( + [ + u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'), + u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'), + u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'), + ] + ) + .addSeriesOverride([ + { + alias: '/.*OUT.*/', + transform: 'negative-Y', + }, + ]), + ]), + 'radosgw-detail.json': + local RgwDetailsPanel(aliasColors, + title, + description, + formatY1, + formatY2, + expr1, + expr2, + legendFormat1, + legendFormat2, + x, + y, + w, + h) = + u.graphPanelSchema(aliasColors, + title, + description, + 'null', + false, + formatY1, + formatY2, + null, + null, + 0, + 1, + '$datasource') + .addTargets( + [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)] + ) + { gridPos: { x: x, y: y, w: w, h: h } }; + + u.dashboardSchema( + 'RGW Instance Detail', + '', + 'x5ARzZtmk', + 'now-1h', + '15s', + 16, + ['overview'], + '', + { + refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'], + time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'], + } + ) + .addAnnotation( + u.addAnnotationSchema( + 1, + '-- Grafana --', + true, + true, + 'rgba(0, 211, 255, 1)', + 'Annotations & Alerts', + 'dashboard' + ) + ) + .addRequired( + type='grafana', id='grafana', name='Grafana', version='5.0.0' + ) + .addRequired( + type='panel', + id='grafana-piechart-panel', + name='Pie Chart', + version='1.3.3' + ) + .addRequired( + type='panel', id='graph', name='Graph', version='5.0.0' + ) + .addTemplate( + g.template.datasource('datasource', + 'prometheus', + 'default', + label='Data Source') + ) + .addTemplate( + u.addTemplateSchema('rgw_servers', + '$datasource', + 'label_values(ceph_rgw_metadata, ceph_daemon)', + 1, + true, + 1, + '', + '') + ) + .addPanels([ + u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + RgwDetailsPanel( + {}, + '$rgw_servers GET/PUT Latencies', + '', + 's', + 'short', + 'sum by (instance_id) 
(rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'GET {{ceph_daemon}}', + 'PUT {{ceph_daemon}}', + 0, + 1, + 6, + 8 + ), + RgwDetailsPanel( + {}, + 'Bandwidth by HTTP Operation', + '', + 'bytes', + 'short', + 'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'GETs {{ceph_daemon}}', + 'PUTs {{ceph_daemon}}', + 6, + 1, + 7, + 8 + ), + RgwDetailsPanel( + { + GETs: '#7eb26d', + Other: '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + 'Requests Failed': '#bf1b00', + }, + 'HTTP Request Breakdown', + '', + 'short', + 'short', + 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 'Requests Failed {{ceph_daemon}}', + 'GETs {{ceph_daemon}}', + 13, + 1, + 7, + 8 + ) + .addTargets( + [ + u.addTargetSchema( + 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'PUTs {{ceph_daemon}}' + ), + u.addTargetSchema( + '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'Other {{ceph_daemon}}' + ), + ] + ), + u.addPieChartSchema( + { + GETs: '#7eb26d', + 'Other (HEAD,POST,DELETE)': '#447ebc', + PUTs: '#eab839', + Requests: '#3f2b5b', + Failures: '#bf1b00', + }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current' + ) + .addTarget(u.addTargetSchema( + 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'Failures {{ceph_daemon}}' + )) + .addTarget(u.addTargetSchema( + 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'GETs {{ceph_daemon}}' + )) + .addTarget(u.addTargetSchema( + 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'PUTs {{ceph_daemon}}' + )) + .addTarget(u.addTargetSchema( + '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', + 1, + 'time_series', + 'Other (DELETE,LIST) {{ceph_daemon}}' + )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } }, + ]), + }, +} diff --git a/monitoring/ceph-mixin/dashboards/utils.libsonnet b/monitoring/ceph-mixin/dashboards/utils.libsonnet new file mode 100644 index 0000000000000..f31c0ffe57853 --- /dev/null +++ b/monitoring/ceph-mixin/dashboards/utils.libsonnet @@ -0,0 +1,172 @@ +local g = import 'grafonnet/grafana.libsonnet'; + +{ + dashboardSchema(title, + description, + uid, + time_from, + refresh, + schemaVersion, + tags, + timezone, + timepicker):: + g.dashboard.new(title=title, + 
description=description, + uid=uid, + time_from=time_from, + refresh=refresh, + schemaVersion=schemaVersion, + tags=tags, + timezone=timezone, + timepicker=timepicker), + + graphPanelSchema(aliasColors, + title, + description, + nullPointMode, + stack, + formatY1, + formatY2, + labelY1, + labelY2, + min, + fill, + datasource, + legend_alignAsTable=false, + legend_avg=false, + legend_min=false, + legend_max=false, + legend_current=false, + legend_values=false):: + g.graphPanel.new(aliasColors=aliasColors, + title=title, + description=description, + nullPointMode=nullPointMode, + stack=stack, + formatY1=formatY1, + formatY2=formatY2, + labelY1=labelY1, + labelY2=labelY2, + min=min, + fill=fill, + datasource=datasource, + legend_alignAsTable=legend_alignAsTable, + legend_avg=legend_avg, + legend_min=legend_min, + legend_max=legend_max, + legend_current=legend_current, + legend_values=legend_values), + + + addTargetSchema(expr, intervalFactor, format, legendFormat):: + g.prometheus.target(expr=expr, + intervalFactor=intervalFactor, + format=format, + legendFormat=legendFormat), + + addTemplateSchema(name, + datasource, + query, + refresh, + includeAll, + sort, + label, + regex):: + g.template.new(name=name, + datasource=datasource, + query=query, + refresh=refresh, + includeAll=includeAll, + sort=sort, + label=label, + regex=regex), + + addAnnotationSchema(builtIn, + datasource, + enable, + hide, + iconColor, + name, + type):: + g.annotation.datasource(builtIn=builtIn, + datasource=datasource, + enable=enable, + hide=hide, + iconColor=iconColor, + name=name, + type=type), + + addRowSchema(collapse, showTitle, title):: + g.row.new(collapse=collapse, showTitle=showTitle, title=title), + + addSingleStatSchema(colors, + datasource, + format, + title, + description, + valueName, + colorValue, + gaugeMaxValue, + gaugeShow, + sparklineShow, + thresholds):: + g.singlestat.new(colors=colors, + datasource=datasource, + format=format, + title=title, + description=description, + valueName=valueName, + colorValue=colorValue, + gaugeMaxValue=gaugeMaxValue, + gaugeShow=gaugeShow, + sparklineShow=sparklineShow, + thresholds=thresholds), + + addPieChartSchema(aliasColors, + datasource, + description, + legendType, + pieType, + title, + valueName):: + g.pieChartPanel.new(aliasColors=aliasColors, + datasource=datasource, + description=description, + legendType=legendType, + pieType=pieType, + title=title, + valueName=valueName), + + addTableSchema(datasource, description, sort, styles, title, transform):: + g.tablePanel.new(datasource=datasource, + description=description, + sort=sort, + styles=styles, + title=title, + transform=transform), + + addStyle(alias, + colorMode, + colors, + dateFormat, + decimals, + mappingType, + pattern, + thresholds, + type, + unit, + valueMaps):: + { + alias: alias, + colorMode: colorMode, + colors: colors, + dateFormat: dateFormat, + decimals: decimals, + mappingType: mappingType, + pattern: pattern, + thresholds: thresholds, + type: type, + unit: unit, + valueMaps: valueMaps, + }, +} diff --git a/monitoring/grafana/dashboards/ceph-cluster.json b/monitoring/ceph-mixin/dashboards_out/ceph-cluster.json similarity index 100% rename from monitoring/grafana/dashboards/ceph-cluster.json rename to monitoring/ceph-mixin/dashboards_out/ceph-cluster.json diff --git a/monitoring/grafana/dashboards/cephfs-overview.json b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json similarity index 99% rename from monitoring/grafana/dashboards/cephfs-overview.json rename to 
monitoring/ceph-mixin/dashboards_out/cephfs-overview.json index 91a37f0807be3..5c0c27329d69a 100644 --- a/monitoring/grafana/dashboards/cephfs-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/cephfs-overview.json @@ -64,6 +64,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -161,6 +162,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, diff --git a/monitoring/grafana/dashboards/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json similarity index 99% rename from monitoring/grafana/dashboards/host-details.json rename to monitoring/ceph-mixin/dashboards_out/host-details.json index 72014860e0487..7b3c1df152efa 100644 --- a/monitoring/grafana/dashboards/host-details.json +++ b/monitoring/ceph-mixin/dashboards_out/host-details.json @@ -157,6 +157,7 @@ "datasource": "$datasource", "description": "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, @@ -249,6 +250,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, @@ -363,6 +365,7 @@ "datasource": "$datasource", "description": "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, @@ -460,6 +463,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 5, "w": 3, @@ -639,6 +643,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 5, "w": 3, @@ -755,6 +760,7 @@ "datasource": "$datasource", "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by its name and corresponding OSD id value", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, @@ -852,6 +858,7 @@ "datasource": "$datasource", "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, @@ -949,6 +956,7 @@ "datasource": "$datasource", "description": "For OSD hosts, this chart shows the latency at the physical drive. 
Each drive is shown by device name, with its corresponding OSD id", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, @@ -1034,6 +1042,7 @@ "datasource": "$datasource", "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 11, diff --git a/monitoring/grafana/dashboards/hosts-overview.json b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json similarity index 93% rename from monitoring/grafana/dashboards/hosts-overview.json rename to monitoring/ceph-mixin/dashboards_out/hosts-overview.json index 758d278cbc6ed..462ddf37bda44 100644 --- a/monitoring/grafana/dashboards/hosts-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/hosts-overview.json @@ -514,7 +514,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum (\n\t(\n\t\tirate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\t\tirate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n\t(\n\t\tirate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\t\tirate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n\t)\n", + "expr": "sum (\n  (\n    irate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n    irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n  ) unless on (device, instance)\n  label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n  (\n    irate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n    irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n  ) unless on (device, instance)\n  label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n  )\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -542,6 +542,7 @@ "datasource": "$datasource", "description": "Show the top 10 busiest hosts by cpu", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -627,6 +628,7 @@ "datasource": "$datasource", "description": "Top 10 hosts by network load", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -661,7 +663,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, (sum by(instance) (\n(\n\tirate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\tirate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) +\n(\n\tirate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\tirate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", + "expr": "topk(10, (sum by(instance) (\n(\n 
irate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) +\n(\n irate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", diff --git a/monitoring/grafana/dashboards/osd-device-details.json b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json similarity index 99% rename from monitoring/grafana/dashboards/osd-device-details.json rename to monitoring/ceph-mixin/dashboards_out/osd-device-details.json index 3c62d179f295e..3b45dc967a53a 100644 --- a/monitoring/grafana/dashboards/osd-device-details.json +++ b/monitoring/ceph-mixin/dashboards_out/osd-device-details.json @@ -64,6 +64,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -161,6 +162,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -258,6 +260,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -374,6 +377,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -471,6 +475,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -568,6 +573,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, @@ -665,6 +671,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 6, diff --git a/monitoring/grafana/dashboards/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json similarity index 98% rename from monitoring/grafana/dashboards/osds-overview.json rename to monitoring/ceph-mixin/dashboards_out/osds-overview.json index 141d520d6f184..dc05689ecb4b9 100644 --- a/monitoring/grafana/dashboards/osds-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json @@ -59,6 +59,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 8, @@ -161,6 +162,7 @@ "y": 0 }, "id": 3, + "links": [ ], "sort": { "col": 2, "desc": true @@ -243,6 +245,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 8, @@ -345,6 +348,7 @@ "y": 0 }, "id": 5, + "links": [ ], "sort": { "col": 2, "desc": true @@ -476,19 +480,12 @@ "legendFormat": "bluestore", "refId": "A" }, - { - "expr": "count(ceph_osd_metadata) - count(ceph_bluefs_wal_total_bytes)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "filestore", - "refId": "B" - }, { "expr": "absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)", "format": "time_series", "intervalFactor": 1, "legendFormat": "filestore", - "refId": "C" + "refId": "B" } ], "title": "OSD Objectstore Types", @@ -589,6 +586,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 8, @@ -774,6 +772,7 @@ "datasource": "$datasource", 
"description": "Show the read/write workload profile overtime", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 24, diff --git a/monitoring/grafana/dashboards/pool-detail.json b/monitoring/ceph-mixin/dashboards_out/pool-detail.json similarity index 99% rename from monitoring/grafana/dashboards/pool-detail.json rename to monitoring/ceph-mixin/dashboards_out/pool-detail.json index e64cc3d82b69b..9a8518e151c61 100644 --- a/monitoring/grafana/dashboards/pool-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/pool-detail.json @@ -217,6 +217,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, @@ -305,6 +306,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, @@ -405,6 +407,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, @@ -505,6 +508,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, diff --git a/monitoring/grafana/dashboards/pool-overview.json b/monitoring/ceph-mixin/dashboards_out/pool-overview.json similarity index 98% rename from monitoring/grafana/dashboards/pool-overview.json rename to monitoring/ceph-mixin/dashboards_out/pool-overview.json index d6c62e6e54db4..d70d4c7ae02a8 100644 --- a/monitoring/grafana/dashboards/pool-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/pool-overview.json @@ -690,6 +690,7 @@ "y": 3 }, "id": 10, + "links": [ ], "sort": { "col": 5, "desc": true @@ -1052,84 +1053,84 @@ "expr": "(ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) (((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5)", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "A", "refId": "A" }, { "expr": "ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "B", "refId": "B" }, { "expr": "((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "C", "refId": "C" }, { "expr": "(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "D", "refId": "D" }, { "expr": "(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used > 0)", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "E", "refId": "E" }, { "expr": "delta(ceph_pool_stored[5d])", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "F", "refId": "F" }, { "expr": "rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "G", "refId": "G" }, { "expr": "rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "H", "refId": "H" }, { "expr": "ceph_pool_metadata", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "I", "refId": "I" }, { "expr": "ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "J", "refId": "J" }, { "expr": "ceph_pool_metadata{compression_mode!=\"none\"}", "format": "table", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "K", 
"refId": "K" }, { "expr": "", "format": "", "intervalFactor": "", - "legendFormat": "", + "legendFormat": "L", "refId": "L" } ], @@ -1147,6 +1148,7 @@ "datasource": "$datasource", "description": "This chart shows the sum of read and write IOPS from all clients by pool", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, @@ -1239,6 +1241,7 @@ "datasource": "$datasource", "description": "The chart shows the sum of read and write bytes from all clients, by pool", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 12, @@ -1324,6 +1327,7 @@ "datasource": "$datasource", "description": "Historical view of capacity usage, to help identify growth and trends in pool consumption", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 24, diff --git a/monitoring/grafana/dashboards/radosgw-detail.json b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json similarity index 99% rename from monitoring/grafana/dashboards/radosgw-detail.json rename to monitoring/ceph-mixin/dashboards_out/radosgw-detail.json index 53486475cbb51..4d68906f2ba07 100644 --- a/monitoring/grafana/dashboards/radosgw-detail.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json @@ -70,6 +70,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 6, @@ -162,6 +163,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 7, @@ -260,6 +262,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 8, "w": 7, diff --git a/monitoring/grafana/dashboards/radosgw-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json similarity index 99% rename from monitoring/grafana/dashboards/radosgw-overview.json rename to monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 7fe94138b1356..7f9375290fc0c 100644 --- a/monitoring/grafana/dashboards/radosgw-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -64,6 +64,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -156,6 +157,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 7, @@ -241,6 +243,7 @@ "datasource": "$datasource", "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 6, @@ -326,6 +329,7 @@ "datasource": "$datasource", "description": "Total bytes transferred in/out of all radosgw instances within the cluster", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 6, "w": 8, @@ -418,6 +422,7 @@ "datasource": "$datasource", "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 6, "w": 7, @@ -503,6 +508,7 @@ "datasource": "$datasource", "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 6, "w": 6, @@ -607,6 +613,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 5, @@ -724,6 +731,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 5, @@ -862,6 +870,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { 
"h": 12, "w": 5, @@ -968,6 +977,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 12, "w": 6, diff --git a/monitoring/grafana/dashboards/radosgw-sync-overview.json b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json similarity index 99% rename from monitoring/grafana/dashboards/radosgw-sync-overview.json rename to monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json index 442da57590652..232242acc5860 100644 --- a/monitoring/grafana/dashboards/radosgw-sync-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/radosgw-sync-overview.json @@ -45,6 +45,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -130,6 +131,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -215,6 +217,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -300,6 +303,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, diff --git a/monitoring/grafana/dashboards/rbd-details.json b/monitoring/ceph-mixin/dashboards_out/rbd-details.json similarity index 96% rename from monitoring/grafana/dashboards/rbd-details.json rename to monitoring/ceph-mixin/dashboards_out/rbd-details.json index d943b16a64e0a..7a9e1b56b8fc9 100644 --- a/monitoring/grafana/dashboards/rbd-details.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-details.json @@ -45,6 +45,7 @@ "datasource": "$Datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 8, @@ -82,14 +83,14 @@ "expr": "irate(ceph_rbd_write_ops{pool=\"$Pool\", image=\"$Image\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Write", + "legendFormat": "{{pool}} Write", "refId": "A" }, { "expr": "irate(ceph_rbd_read_ops{pool=\"$Pool\", image=\"$Image\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Read", + "legendFormat": "{{pool}} Read", "refId": "B" } ], @@ -137,6 +138,7 @@ "datasource": "$Datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 8, @@ -174,14 +176,14 @@ "expr": "irate(ceph_rbd_write_bytes{pool=\"$Pool\", image=\"$Image\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Write", + "legendFormat": "{{pool}} Write", "refId": "A" }, { "expr": "irate(ceph_rbd_read_bytes{pool=\"$Pool\", image=\"$Image\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Read", + "legendFormat": "{{pool}} Read", "refId": "B" } ], @@ -229,6 +231,7 @@ "datasource": "$Datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 9, "w": 8, @@ -266,14 +269,14 @@ "expr": "irate(ceph_rbd_write_latency_sum{pool=\"$Pool\", image=\"$Image\"}[30s]) / irate(ceph_rbd_write_latency_count{pool=\"$Pool\", image=\"$Image\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Write", + "legendFormat": "{{pool}} Write", "refId": "A" }, { "expr": "irate(ceph_rbd_read_latency_sum{pool=\"$Pool\", image=\"$Image\"}[30s]) / irate(ceph_rbd_read_latency_count{pool=\"$Pool\", image=\"$Image\"}[30s])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Read", + "legendFormat": "{{pool}} Read", "refId": "B" } ], diff --git a/monitoring/grafana/dashboards/rbd-overview.json b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json similarity index 99% rename from 
monitoring/grafana/dashboards/rbd-overview.json rename to monitoring/ceph-mixin/dashboards_out/rbd-overview.json index 5f0ade741e971..71c32ce71fb86 100644 --- a/monitoring/grafana/dashboards/rbd-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/rbd-overview.json @@ -57,6 +57,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -149,6 +150,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -241,6 +243,7 @@ "datasource": "$datasource", "description": "", "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 8, @@ -336,6 +339,7 @@ "y": 7 }, "id": 5, + "links": [ ], "sort": { "col": 3, "desc": true @@ -436,6 +440,7 @@ "y": 7 }, "id": 6, + "links": [ ], "sort": { "col": 3, "desc": true @@ -536,6 +541,7 @@ "y": 7 }, "id": 7, + "links": [ ], "sort": { "col": 3, "desc": true diff --git a/monitoring/ceph-mixin/jsonnetfile.json b/monitoring/ceph-mixin/jsonnetfile.json new file mode 100644 index 0000000000000..93f3316ec3830 --- /dev/null +++ b/monitoring/ceph-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/monitoring/ceph-mixin/jsonnetfile.lock.json b/monitoring/ceph-mixin/jsonnetfile.lock.json new file mode 100644 index 0000000000000..0430b39fc3674 --- /dev/null +++ b/monitoring/ceph-mixin/jsonnetfile.lock.json @@ -0,0 +1,16 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "3626fc4dc2326931c530861ac5bebe39444f6cbf", + "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w=" + } + ], + "legacyImports": false +} diff --git a/monitoring/ceph-mixin/lint-jsonnet.sh b/monitoring/ceph-mixin/lint-jsonnet.sh new file mode 100755 index 0000000000000..6f77162698246 --- /dev/null +++ b/monitoring/ceph-mixin/lint-jsonnet.sh @@ -0,0 +1,5 @@ +#!/bin/sh -e + +JSONNETS_FILES=$(find . -name 'vendor' -prune -o \ + -name '*.jsonnet' -print -o -name '*.libsonnet' -print) +jsonnetfmt "$@" ${JSONNETS_FILES} diff --git a/monitoring/ceph-mixin/mixin.libsonnet b/monitoring/ceph-mixin/mixin.libsonnet new file mode 100644 index 0000000000000..c89b2a916a891 --- /dev/null +++ b/monitoring/ceph-mixin/mixin.libsonnet @@ -0,0 +1,3 @@ +(import 'config.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'alerts.libsonnet') diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yaml similarity index 99% rename from monitoring/prometheus/alerts/ceph_default_alerts.yml rename to monitoring/ceph-mixin/prometheus_alerts.yaml index eadb05a05a820..578596f4af0bc 100644 --- a/monitoring/prometheus/alerts/ceph_default_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yaml @@ -898,4 +898,4 @@ groups: description: | One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes don't go unseen. To acknowledge a crash, use the - 'ceph crash archive <id>' command. \ No newline at end of file + 'ceph crash archive <id>' command.
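The files above are the core of the new mixin layout: jsonnetfile.json declares the grafonnet-lib dependency, jsonnetfile.lock.json pins it to an exact commit, mixin.libsonnet is the conventional mixin entry point merging config, dashboards and alerts, and lint-jsonnet.sh wraps jsonnetfmt. As a minimal sketch of the local workflow (assuming jb, jsonnet and jsonnetfmt are installed and the working directory is monitoring/ceph-mixin/; these are the same commands the tox environments added later in this patch invoke):

    jb install                 # vendor grafonnet-lib at the commit pinned in jsonnetfile.lock.json
    jsonnet -J vendor -m dashboards_out dashboards.jsonnet   # regenerate the dashboard JSON
    ./lint-jsonnet.sh --test   # verify jsonnet formatting without rewriting files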
diff --git a/monitoring/prometheus/tests/requirements.txt b/monitoring/ceph-mixin/requirements-alerts.txt similarity index 100% rename from monitoring/prometheus/tests/requirements.txt rename to monitoring/ceph-mixin/requirements-alerts.txt diff --git a/monitoring/grafana/dashboards/requirements-grafonnet.txt b/monitoring/ceph-mixin/requirements-grafonnet.txt similarity index 100% rename from monitoring/grafana/dashboards/requirements-grafonnet.txt rename to monitoring/ceph-mixin/requirements-grafonnet.txt diff --git a/monitoring/grafana/dashboards/requirements-lint.txt b/monitoring/ceph-mixin/requirements-lint.txt similarity index 100% rename from monitoring/grafana/dashboards/requirements-lint.txt rename to monitoring/ceph-mixin/requirements-lint.txt diff --git a/monitoring/ceph-mixin/test-jsonnet.sh b/monitoring/ceph-mixin/test-jsonnet.sh new file mode 100755 index 0000000000000..fef0443a9ade5 --- /dev/null +++ b/monitoring/ceph-mixin/test-jsonnet.sh @@ -0,0 +1,31 @@ +#!/bin/sh -e + +TEMPDIR=$(mktemp -d) +BASEDIR=$(dirname "$0") + +jsonnet -J vendor -m ${TEMPDIR} $BASEDIR/dashboards.jsonnet + +truncate -s 0 ${TEMPDIR}/json_difference.log +for file in ${BASEDIR}/dashboards_out/*.json +do + file_name="$(basename $file)" + for generated_file in ${TEMPDIR}/*.json + do + generated_file_name="$(basename $generated_file)" + if [ "$file_name" = "$generated_file_name" ]; then + jsondiff --indent 2 "${generated_file}" "${file}" \ + | tee -a ${TEMPDIR}/json_difference.log + fi + done +done + +err=0 +if [ $(wc -l < ${TEMPDIR}/json_difference.log) -eq 0 ] +then + rm -rf ${TEMPDIR} + echo "Congratulations! Grafonnet Check Passed" +else + rm -rf ${TEMPDIR} + echo "Grafonnet Check Failed: generated dashboards differ from the committed files" + exit 1 +fi diff --git a/monitoring/prometheus/tests/README.md b/monitoring/ceph-mixin/tests_alerts/README.md similarity index 100% rename from monitoring/prometheus/tests/README.md rename to monitoring/ceph-mixin/tests_alerts/README.md diff --git a/monitoring/grafana/dashboards/tests/features/__init__.py b/monitoring/ceph-mixin/tests_alerts/__init__.py similarity index 100% rename from monitoring/grafana/dashboards/tests/features/__init__.py rename to monitoring/ceph-mixin/tests_alerts/__init__.py diff --git a/monitoring/ceph-mixin/tests_alerts/settings.py b/monitoring/ceph-mixin/tests_alerts/settings.py new file mode 100644 index 0000000000000..9dc639fd30cb2 --- /dev/null +++ b/monitoring/ceph-mixin/tests_alerts/settings.py @@ -0,0 +1,11 @@ +import os + +ALERTS_FILE = '../prometheus_alerts.yaml' +UNIT_TESTS_FILE = 'test_alerts.yml' +MIB_FILE = '../../snmp/CEPH-MIB.txt' + +current_dir = os.path.dirname(os.path.abspath(__file__)) + +ALERTS_FILE = os.path.join(current_dir, ALERTS_FILE) +UNIT_TESTS_FILE = os.path.join(current_dir, UNIT_TESTS_FILE) +MIB_FILE = os.path.join(current_dir, MIB_FILE) diff --git a/monitoring/prometheus/tests/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml similarity index 99% rename from monitoring/prometheus/tests/test_alerts.yml rename to monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 7bc4d4f57f2d4..66dacfa286c02 100644 --- a/monitoring/prometheus/tests/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -1,5 +1,5 @@ rule_files: - - ../alerts/ceph_default_alerts.yml + - ../prometheus_alerts.yaml evaluation_interval: 5m tests: # health error @@ -2025,4 +2025,4 @@ tests: description: | One or more daemons have crashed recently, and need to be acknowledged.
This notification ensures that software crashes don't go unseen. To acknowledge a crash, use the - 'ceph crash archive <id>' command. \ No newline at end of file + 'ceph crash archive <id>' command. diff --git a/monitoring/prometheus/tests/test_syntax.py b/monitoring/ceph-mixin/tests_alerts/test_syntax.py similarity index 100% rename from monitoring/prometheus/tests/test_syntax.py rename to monitoring/ceph-mixin/tests_alerts/test_syntax.py diff --git a/monitoring/prometheus/tests/test_unittests.py b/monitoring/ceph-mixin/tests_alerts/test_unittests.py similarity index 100% rename from monitoring/prometheus/tests/test_unittests.py rename to monitoring/ceph-mixin/tests_alerts/test_unittests.py diff --git a/monitoring/prometheus/tests/utils.py b/monitoring/ceph-mixin/tests_alerts/utils.py similarity index 100% rename from monitoring/prometheus/tests/utils.py rename to monitoring/ceph-mixin/tests_alerts/utils.py diff --git a/monitoring/prometheus/tests/validate_rules.py b/monitoring/ceph-mixin/tests_alerts/validate_rules.py similarity index 98% rename from monitoring/prometheus/tests/validate_rules.py rename to monitoring/ceph-mixin/tests_alerts/validate_rules.py index 428779a47de87..c24ce5c59d553 100755 --- a/monitoring/prometheus/tests/validate_rules.py +++ b/monitoring/ceph-mixin/tests_alerts/validate_rules.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 -u +#!/usr/bin/env python3 # # Check the Prometheus rules for format, and integration # with the unit tests. This script has the following exit @@ -27,10 +27,9 @@ import urllib.request import urllib.error from urllib.parse import urlparse +from settings import ALERTS_FILE, MIB_FILE, UNIT_TESTS_FILE + DOCLINK_NAME = 'documentation' -DEFAULT_RULES_FILENAME = '../alerts/ceph_default_alerts.yml' -DEFAULT_TEST_FILENAME = 'test_alerts.yml' -MIB_FILE = '../../snmp/CEPH-MIB.txt' def isascii(s: str) -> bool: @@ -463,8 +462,8 @@ class UnitTests: class RuleChecker: def __init__(self, rules_filename: str = None, test_filename: str = None): - self.rules_filename = rules_filename or DEFAULT_RULES_FILENAME - self.test_filename = test_filename or DEFAULT_TEST_FILENAME + self.rules_filename = rules_filename or ALERTS_FILE + self.test_filename = test_filename or UNIT_TESTS_FILE self.rule_file: Optional[RuleFile] = None self.unit_tests: Optional[UnitTests] = None self.rule_file_problems: bool = False diff --git a/monitoring/grafana/dashboards/tests/__init__.py b/monitoring/ceph-mixin/tests_dashboards/__init__.py similarity index 100% rename from monitoring/grafana/dashboards/tests/__init__.py rename to monitoring/ceph-mixin/tests_dashboards/__init__.py diff --git a/monitoring/prometheus/tests/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/__init__.py similarity index 100% rename from monitoring/prometheus/tests/__init__.py rename to monitoring/ceph-mixin/tests_dashboards/features/__init__.py diff --git a/monitoring/grafana/dashboards/tests/features/ceph-cluster.feature b/monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/ceph-cluster.feature rename to monitoring/ceph-mixin/tests_dashboards/features/ceph-cluster.feature diff --git a/monitoring/grafana/dashboards/tests/features/environment.py b/monitoring/ceph-mixin/tests_dashboards/features/environment.py similarity index 97% rename from monitoring/grafana/dashboards/tests/features/environment.py rename to monitoring/ceph-mixin/tests_dashboards/features/environment.py index 8509b9d97e8e2..5dc76a09e41d0 100644 ---
a/monitoring/grafana/dashboards/tests/features/environment.py +++ b/monitoring/ceph-mixin/tests_dashboards/features/environment.py @@ -5,8 +5,8 @@ import copy from behave import given, then, when from prettytable import PrettyTable -from tests import PromqlTest -from tests.util import get_dashboards_data, resolve_time_and_unit +from tests_dashboards import PromqlTest +from tests_dashboards.util import get_dashboards_data, resolve_time_and_unit class GlobalContext: diff --git a/monitoring/grafana/dashboards/tests/features/host-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/host-details.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/host-details.feature rename to monitoring/ceph-mixin/tests_dashboards/features/host-details.feature diff --git a/monitoring/grafana/dashboards/tests/features/hosts_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/hosts_overview.feature rename to monitoring/ceph-mixin/tests_dashboards/features/hosts_overview.feature diff --git a/monitoring/grafana/dashboards/tests/features/osd-device-details.feature b/monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/osd-device-details.feature rename to monitoring/ceph-mixin/tests_dashboards/features/osd-device-details.feature diff --git a/monitoring/grafana/dashboards/tests/features/osds-overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/osds-overview.feature rename to monitoring/ceph-mixin/tests_dashboards/features/osds-overview.feature diff --git a/monitoring/grafana/dashboards/tests/features/radosgw-detail.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/radosgw-detail.feature rename to monitoring/ceph-mixin/tests_dashboards/features/radosgw-detail.feature diff --git a/monitoring/grafana/dashboards/tests/features/radosgw_overview.feature b/monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/radosgw_overview.feature rename to monitoring/ceph-mixin/tests_dashboards/features/radosgw_overview.feature diff --git a/monitoring/grafana/dashboards/tests/features/self.feature b/monitoring/ceph-mixin/tests_dashboards/features/self.feature similarity index 100% rename from monitoring/grafana/dashboards/tests/features/self.feature rename to monitoring/ceph-mixin/tests_dashboards/features/self.feature diff --git a/monitoring/grafana/dashboards/tests/features/steps/__init__.py b/monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py similarity index 100% rename from monitoring/grafana/dashboards/tests/features/steps/__init__.py rename to monitoring/ceph-mixin/tests_dashboards/features/steps/__init__.py diff --git a/monitoring/grafana/dashboards/tests/requirements.txt b/monitoring/ceph-mixin/tests_dashboards/requirements.txt similarity index 100% rename from monitoring/grafana/dashboards/tests/requirements.txt rename to monitoring/ceph-mixin/tests_dashboards/requirements.txt diff --git a/monitoring/grafana/dashboards/tests/util.py b/monitoring/ceph-mixin/tests_dashboards/util.py similarity index 95% rename from 
monitoring/grafana/dashboards/tests/util.py rename to monitoring/ceph-mixin/tests_dashboards/util.py index 4f09e9edd3b0f..b5872deafc990 100644 --- a/monitoring/grafana/dashboards/tests/util.py +++ b/monitoring/ceph-mixin/tests_dashboards/util.py @@ -22,7 +22,8 @@ def resolve_time_and_unit(time: str) -> Union[Tuple[int, str], Tuple[None, None] def get_dashboards_data() -> Dict[str, Any]: data: Dict[str, Any] = {'queries': {}, 'variables': {}, 'stats': {}} - for file in Path(__file__).parent.parent.glob('*.json'): + for file in Path(__file__).parent.parent \ + .joinpath('dashboards_out').glob('*.json'): with open(file, 'r') as f: dashboard_data = json.load(f) data['stats'][str(file)] = {'total': 0, 'tested': 0} diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini new file mode 100644 index 0000000000000..e15e17084f7ce --- /dev/null +++ b/monitoring/ceph-mixin/tox.ini @@ -0,0 +1,69 @@ +[tox] +envlist = lint,jsonnet-{check,lint,fix},promql-query-{test,lint},alerts-check +skipsdist = true + +[testenv:jsonnet-bundler-{install,update}] +whitelist_externals = + jb +description = + install: Install the jsonnet dependencies + update: Update the jsonnet dependencies +commands = + install: jb install + update: jb update + +[testenv:jsonnet-{check,fix,lint}] +basepython = python3 +whitelist_externals = + find + jb + jsonnet + jsonnetfmt + sh +description = + check: Ensure that auto-generated files match the current version + fix: Update generated files from jsonnet files with latest changes + lint: Test if jsonnet files are linted (without any update) +deps = + -rrequirements-grafonnet.txt +depends = jsonnet-bundler-install +commands = + check: sh test-jsonnet.sh + lint: ./lint-jsonnet.sh --test + fix: jsonnet -J vendor -m dashboards_out dashboards.jsonnet + +[testenv:lint] +description = + Run python linters +deps = + -rrequirements-lint.txt +setenv = +commands = + pylint --rcfile=.pylintrc tests_dashboards + mypy tests_dashboards + isort tests_dashboards + +[testenv:promql-query-test] +description = + lint: Run promtool check on grafana queries + test: Run promtool unit testing on grafana queries. +deps = + -rrequirements-lint.txt +depends = grafonnet-check +setenv = +whitelist_externals = + promtool +commands = + behave tests_dashboards/features + +[testenv:alerts-{check,lint}] +deps = + -rrequirements-alerts.txt + pytest +depends = grafonnet-check +whitelist_externals = + promtool +commands = + lint: promtool check rules prometheus_alerts.yaml + test: pytest -rA tests_alerts/test_syntax.py tests_alerts/test_unittests.py + python3 ./tests_alerts/validate_rules.py diff --git a/monitoring/grafana/README.md b/monitoring/grafana/README.md deleted file mode 100644 index b4bf4ec3273d0..0000000000000 --- a/monitoring/grafana/README.md +++ /dev/null @@ -1,14 +0,0 @@ -## Grafana dashboards for Ceph - -Here you can find a collection of [Grafana](https://grafana.com/grafana) -dashboards for Ceph Monitoring. These dashboards are based on metrics collected -from [prometheus](https://prometheus.io/) scraping the [prometheus mgr -plugin](http://docs.ceph.com/en/latest/mgr/prometheus/) and the -[node_exporter](https://github.com/prometheus/node_exporter).
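The tox.ini above becomes the single driver for all of these checks; note that tox only whitelists the external tools (jb, jsonnet, jsonnetfmt, promtool), so they must already be installed. A hedged sketch of typical invocations from monitoring/ceph-mixin/:

    tox -e jsonnet-bundler-install   # jb install: populate vendor/ before the jsonnet envs
    tox -e jsonnet-check             # test-jsonnet.sh: fails if dashboards_out/ is stale
    tox -e jsonnet-fix               # regenerate dashboards_out/ from the jsonnet sources
    tox -e lint                      # pylint/mypy/isort over tests_dashboards
    tox -e alerts-lint               # promtool check rules prometheus_alerts.yaml
    tox -e alerts-check              # tests_alerts/validate_rules.py against the alerts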
- -### Other requirements - -- Luminous 12.2.5 or newer -- [Status Panel](https://grafana.com/plugins/vonage-status-panel) installed -- node_exporter 0.15.x and 0.16.x are supported (host details and hosts -overview dashboards) diff --git a/monitoring/grafana/build/Makefile b/monitoring/grafana/build/Makefile index f1b46cdd7fdd0..64f61999026c8 100755 --- a/monitoring/grafana/build/Makefile +++ b/monitoring/grafana/build/Makefile @@ -2,7 +2,7 @@ GRAFANA_VERSION ?= 8.3.5-1 PIECHART_VERSION ?= "1.6.2" STATUS_PANEL_VERSION ?= "1.0.11" -DASHBOARD_DIR := "../dashboards" +DASHBOARD_DIR := "../../ceph-mixin/dashboards_out" DASHBOARD_PROVISIONING := "ceph-dashboard.yml" IMAGE := "docker.io/redhat/ubi8:8.5" PKGMGR := "dnf" diff --git a/monitoring/grafana/dashboards/.pylintrc b/monitoring/grafana/dashboards/.pylintrc deleted file mode 120000 index aa04b020cb4c1..0000000000000 --- a/monitoring/grafana/dashboards/.pylintrc +++ /dev/null @@ -1 +0,0 @@ -../../../src/pybind/mgr/dashboard/.pylintrc \ No newline at end of file diff --git a/monitoring/grafana/dashboards/CMakeLists.txt b/monitoring/grafana/dashboards/CMakeLists.txt deleted file mode 100644 index 51ef18e4376ab..0000000000000 --- a/monitoring/grafana/dashboards/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -set(CEPH_GRAFANA_DASHBOARDS_DIR "${CMAKE_INSTALL_SYSCONFDIR}/grafana/dashboards/ceph-dashboard" - CACHE PATH "Location for grafana dashboards") -file(GLOB CEPH_GRAFANA_DASHBOARDS "*.json") -install(FILES - ${CEPH_GRAFANA_DASHBOARDS} - DESTINATION ${CEPH_GRAFANA_DASHBOARDS_DIR}) - -set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR}) -if(NOT CEPH_BUILD_VIRTUALENV) - set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR}) -endif() - -if(WITH_GRAFANA) - if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm|ARM") - include(AddCephTest) - add_tox_test(grafana-check TOX_ENVS grafonnet-check) - add_tox_test(grafana-query-test TOX_ENVS promql-query-test) - add_tox_test(grafana-lint TOX_ENVS lint) - set(ver 0.1.0) - set(name grafonnet-lib) - include(ExternalProject) - ExternalProject_Add(${name} - URL https://github.com/grafana/${name}/archive/v${ver}/${name}-${ver}.tar.gz - URL_MD5 0798752ed40864fa8b3db40a3c970642 - BUILD_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - EXCLUDE_FROM_ALL ON) - add_dependencies(tests - ${name}) - ExternalProject_Get_Property(${name} SOURCE_DIR) - set_property( - TEST run-tox-grafana-check run-tox-grafana-query-test run-tox-grafana-lint - APPEND - PROPERTY ENVIRONMENT - GRAFONNET_PATH=${SOURCE_DIR}/grafonnet) - endif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm|ARM") -endif() diff --git a/monitoring/grafana/dashboards/README b/monitoring/grafana/dashboards/README deleted file mode 100644 index 3803cd7a00422..0000000000000 --- a/monitoring/grafana/dashboards/README +++ /dev/null @@ -1,28 +0,0 @@ -Context -These dashboards should be enough to get started on the integration. It's not a complete set, so more will be added in the next week. - -Bare in mind that the osd device details dashboard needs node_exporter active - all the other dashboards pick data out of ceph-mgr based metrics. - - -The cephfs dashboard only has 2 panels currently. The counter available are -a little light at the moment. Patrick/Venky have been addressing this with -https://bugzilla.redhat.com/show_bug.cgi?id=1618523 -cephfs-overview.json - -Host Information -host-details.json combines generic server metrics that show cpu/memory/network stats (including network errors/drops), -with disk level stats for OSD hosts. 
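Since the alert unit tests moved to tests_alerts/ and test_alerts.yml now points at ../prometheus_alerts.yaml, the alerts can also be exercised with promtool directly, outside tox. A sketch, assuming promtool is installed and run from inside tests_alerts/ so the relative rule_files path resolves:

    cd monitoring/ceph-mixin/tests_alerts
    promtool check rules ../prometheus_alerts.yaml   # syntax/structure check of the alert rules
    promtool test rules test_alerts.yml              # evaluate the unit tests against the rules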
OSD charts show the physical device name together with it's corresponding osd id for correlation. - -Ceph Pools -two dashboards. Overview gives the high level combined view, pool-detail needs a pool_name variable passed to it (currently uses a templating var which is visible) -pool-overview.json -pool-detail.json - -OSD Device Details. This dashboard needs some further work. It currently shows -OSD level stats with physical device stats but leaves out some of the counters -that cephmetrics provides for trouble shooting. -osd-device-details.json - -Object gateway dashboards, again split into overview and detail. The detail dashboard needs the relevant ceph-deamon name for the rgw instance. -radosgw-overview.json -radosgw-detail.json diff --git a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet b/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet deleted file mode 100644 index 8b6935ab17ec4..0000000000000 --- a/monitoring/grafana/dashboards/jsonnet/grafana_dashboards.jsonnet +++ /dev/null @@ -1,1510 +0,0 @@ -local g = import 'grafana.libsonnet'; - -local dashboardSchema(title, description, uid, time_from, refresh, schemaVersion, tags, timezone, timepicker) = - g.dashboard.new(title=title, description=description, uid=uid, time_from=time_from, refresh=refresh, schemaVersion=schemaVersion, tags=tags, timezone=timezone, timepicker=timepicker); - -local graphPanelSchema(aliasColors, title, description, nullPointMode, stack, formatY1, formatY2, labelY1, labelY2, min, fill, datasource, legend_alignAsTable=false, legend_avg=false, legend_min=false, legend_max=false, legend_current=false, legend_values=false) = - g.graphPanel.new(aliasColors=aliasColors, title=title, description=description, nullPointMode=nullPointMode, stack=stack, formatY1=formatY1, formatY2=formatY2, labelY1=labelY1, labelY2=labelY2, min=min, fill=fill, datasource=datasource, legend_alignAsTable=legend_alignAsTable, legend_avg=legend_avg, legend_min=legend_min, legend_max=legend_max, legend_current=legend_current, legend_values=legend_values); - -local addTargetSchema(expr, intervalFactor, format, legendFormat) = - g.prometheus.target(expr=expr, intervalFactor=intervalFactor, format=format, legendFormat=legendFormat); - -local addTemplateSchema(name, datasource, query, refresh, includeAll, sort, label, regex) = - g.template.new(name=name, datasource=datasource, query=query, refresh=refresh, includeAll=includeAll, sort=sort, label=label, regex=regex); - -local addAnnotationSchema(builtIn, datasource, enable, hide, iconColor, name, type) = - g.annotation.datasource(builtIn=builtIn, datasource=datasource, enable=enable, hide=hide, iconColor=iconColor, name=name, type=type); - -local addRowSchema(collapse, showTitle, title) = - g.row.new(collapse=collapse, showTitle=showTitle, title=title); - -local addSingelStatSchema(colors, datasource, format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparklineShow, thresholds) = - g.singlestat.new(colors=colors, datasource=datasource, format=format, title=title, description=description, valueName=valueName, colorValue=colorValue, gaugeMaxValue=gaugeMaxValue, gaugeShow=gaugeShow, sparklineShow=sparklineShow, thresholds=thresholds); - -local addPieChartSchema(aliasColors, datasource, description, legendType, pieType, title, valueName) = - g.pieChartPanel.new(aliasColors=aliasColors, datasource=datasource, description=description, legendType=legendType, pieType=pieType, title=title, valueName=valueName); - -local addTableSchema(datasource, 
description, sort, styles, title, transform) = - g.tablePanel.new(datasource=datasource, description=description, sort=sort, styles=styles, title=title, transform=transform); - -local addStyle(alias, colorMode, colors, dateFormat, decimals, mappingType, pattern, thresholds, type, unit, valueMaps) = - {'alias': alias, 'colorMode': colorMode, 'colors':colors, 'dateFormat':dateFormat, 'decimals':decimals, 'mappingType':mappingType, 'pattern':pattern, 'thresholds':thresholds, 'type':type, 'unit':unit, 'valueMaps':valueMaps}; - -{ - "hosts-overview.json": - local HostsOverviewSingleStatPanel(format, title, description, valueName, expr, targetFormat, x, y, w, h) = - addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, false, 100, false, false, '') - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; - - local HostsOverviewGraphPanel(title, description, formatY1, expr, legendFormat, x, y, w, h) = - graphPanelSchema({}, title, description, 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'Host Overview', '', 'y0KGL0iZz', 'now-1h', '10s', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - addTemplateSchema('osd_hosts', '$datasource', 'label_values(ceph_disk_occupation, exported_instance)', 1, true, 1, null, '([^.]*).*') - ) - .addTemplate( - addTemplateSchema('mon_hosts', '$datasource', 'label_values(ceph_mon_metadata, ceph_daemon)', 1, true, 1, null, 'mon.(.*)') - ) - .addTemplate( - addTemplateSchema('mds_hosts', '$datasource', 'label_values(ceph_mds_inodes, ceph_daemon)', 1, true, 1, null, 'mds.(.*)') - ) - .addTemplate( - addTemplateSchema('rgw_hosts', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, null, 'rgw.(.*)') - ) - .addPanels([ - HostsOverviewSingleStatPanel( - 'none', - 'OSD Hosts', - '', - 'current', - 'count(sum by (hostname) (ceph_osd_metadata))', - 'time_series', - 0, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'percentunit', - 'AVG CPU Busy', - 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', - 'current', - 'avg(\n 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]))\n )\n )', - 'time_series', - 4, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'percentunit', - 'AVG RAM Utilization', - 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', - 'current', - 'avg (((node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})- (\n 
(node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + \n (node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +\n (node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"})\n )) /\n (node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} ))', - 'time_series', - 8, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'none', - 'Physical IOPS', - 'IOPS Load at the device as reported by the OS on all OSD hosts', - 'current', - 'sum ((irate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[5m]) ) + \n(irate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[5m]) or irate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[5m])))', - 'time_series', - 12, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'percent', - 'AVG Disk Utilization', - 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)', - 'current', - 'avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), "instance", "$1", "instance", "([^.:]*).*"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($osd_hosts).*"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^.:]*).*")\n)', - 'time_series', - 16, 0, 4, 5 - ), - HostsOverviewSingleStatPanel( - 'bytes', - 'Network Load', - 'Total send/receive network load across all hosts in the ceph cluster', - 'current', - 'sum (\n\t(\n\t\tirate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or\n\t\tirate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")\n) +\nsum (\n\t(\n\t\tirate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m]) or\n\t\tirate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")\n\t)\n' - , 'time_series', - 20, 0, 4, 5 - ), - HostsOverviewGraphPanel( - 'CPU Busy - Top 10 Hosts', - 'Show the top 10 busiest hosts by cpu', - 'percent', - 'topk(10,100 * ( 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]) or\n irate(node_cpu{mode=\'idle\',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[1m]))\n )\n )\n)', - '{{instance}}', - 0, 5, 12, 9 - ), - HostsOverviewGraphPanel( - 'Network Load - Top 10 Hosts', - 'Top 10 hosts by network load', - 'Bps', - 'topk(10, (sum by(instance) 
(\n(\n\tirate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or\n\tirate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])\n) +\n(\n\tirate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m]) or\n\tirate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[1m])\n) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))\n))\n' - , - '{{instance}}', - 12, 5, 12, 9 - ), - ]) -} -{ - "host-details.json": - local HostDetailsSingleStatPanel(format, title, description, valueName, expr, targetFormat, x, y, w, h) = - addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, false, 100, false, false, '') - .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}}; - - local HostDetailsGraphPanel(alias, title, description, nullPointMode, formatY1, labelY1, expr, legendFormat, x, y, w, h) = - graphPanelSchema(alias, title, description, nullPointMode, false, formatY1, 'short', labelY1, null, null, 1, '$datasource') - .addTargets( - [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'Host Details', '', 'rtOg0AiWz', 'now-1h', '10s', 16, ['overview'], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.3.2' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addRequired( - type='panel', id='singlestat', name='Singlestat', version='5.0.0' - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - addTemplateSchema('ceph_hosts', '$datasource', 'label_values(node_scrape_collector_success, instance) ', 1, false, 3, 'Hostname', '([^.:]*).*') - ) - .addPanels([ - addRowSchema(false, true, '$ceph_hosts System Overview') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, - HostDetailsSingleStatPanel( - 'none', - 'OSDs', - '', - 'current', - 'count(sum by (ceph_daemon) (ceph_osd_metadata{hostname=\'$ceph_hosts\'}))', - 'time_series', - 0, 1, 3, 5 - ), - HostDetailsGraphPanel( - {"interrupt": "#447EBC","steal": "#6D1F62","system": "#890F02","user": "#3F6833","wait": "#C15C17"}, - 'CPU Utilization', - 'Shows the CPU breakdown. 
When multiple servers are selected, only the first host\'s cpu data is shown', - 'null', - 'percent', - '% Utilization', - 'sum by (mode) (\n irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]) or\n irate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[1m]))\n) * 100', - '{{mode}}', - 3, 1, 6, 10 - ), - HostDetailsGraphPanel( - {"Available": "#508642","Free": "#508642","Total": "#bf1b00","Used": "#bf1b00","total": "#bf1b00","used": "#0a50a1"}, - 'RAM Usage', - '', - 'null', - 'bytes', - 'RAM used', - 'node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', - 'Free', - 9, 1, 6, 10) - .addTargets( - [ - addTargetSchema( - 'node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} ', - 1, - 'time_series', - 'total' - ), - addTargetSchema( - '(node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n(node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n(node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) \n', - 1, - 'time_series', - 'buffers/cache' - ), - addTargetSchema( - '(node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})- (\n (node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) + \n (node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}) +\n (node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"})\n )\n \n', - 1, - 'time_series', - 'used' - )]) - .addSeriesOverride({"alias": "total","color": "#bf1b00","fill": 0,"linewidth": 2,"stack": false} - ), - HostDetailsGraphPanel( - {}, - 'Network Load', - 'Show the network load (rx,tx) across all interfaces (excluding loopback \'lo\')', - 'null', - 'decbytes', - 'Send (-) / Receive (+)', - 'sum by (device) (\n irate(node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', - '{{device}}.rx', - 15, 1, 6, 10 - ) - .addTargets( - [ - addTargetSchema( - 'sum by (device) (\n irate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[1m])\n)', - 1, - 'time_series', - '{{device}}.tx' - )]) - .addSeriesOverride({"alias": "/.*tx/","transform": "negative-Y"} - ), - HostDetailsGraphPanel( - {}, - 'Network drop rate', - '', - 'null', - 'pps', - 'Send (-) / Receive (+)', - 'irate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - '{{device}}.rx', - 21, 1, 
3, 5 - ) - .addTargets( - [ - addTargetSchema( - 'irate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - 1, - 'time_series', - '{{device}}.tx' - )]) - .addSeriesOverride({"alias": "/.*tx/","transform": "negative-Y"} - ), - HostDetailsSingleStatPanel( - 'bytes', - 'Raw Capacity', - 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', - 'current', - 'sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~"($ceph_hosts)([\\\\.:].*)?"})', - 'time_series', - 0, 6, 3, 5 - ), - HostDetailsGraphPanel( - {}, - 'Network error rate', - '', - 'null', - 'pps', - 'Send (-) / Receive (+)', - 'irate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - '{{device}}.rx', - 21, 6, 3, 5 - ) - .addTargets( - [ - addTargetSchema( - 'irate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m]) or irate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[1m])', - 1, - 'time_series', - '{{device}}.tx' - )]) - .addSeriesOverride({"alias": "/.*tx/","transform": "negative-Y"} - ), - addRowSchema(false, true, 'OSD Disk Performance Statistics') + {gridPos: {x: 0, y: 11, w: 24, h: 1}}, - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk IOPS', - 'For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it\'s name and corresponding OSD id value', - 'connected', - 'ops', - 'Read (-) / Write (+)', - 'label_replace(\n (\n irate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )', - '{{device}}({{ceph_daemon}}) writes', - 0, 12, 11, 9 - ) - .addTargets( - [ - addTargetSchema( - 'label_replace(\n (irate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n "device",\n "$1",\n "device",\n "/dev/(.*)"\n ),\n "instance",\n "$1",\n "instance",\n "([^:.]*).*"\n )', - 1, - 'time_series', - '{{device}}({{ceph_daemon}}) reads' - )]) - .addSeriesOverride({"alias": "/.*reads/","transform": "negative-Y"} - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Throughput by Disk', - 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. 
Each device is shown by device name, and corresponding OSD id', - 'connected', - 'Bps', - 'Read (-) / Write (+)', - 'label_replace((irate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - '{{device}}({{ceph_daemon}}) write', - 12, 12, 11, 9 - ) - .addTargets( - [ - addTargetSchema( - 'label_replace((irate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) or irate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m])), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - 1, - 'time_series', - '{{device}}({{ceph_daemon}}) read' - )]) - .addSeriesOverride({"alias": "/.*read/","transform": "negative-Y"} - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk Latency', - 'For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it\'s corresponding OSD id', - 'null as zero', - 's', - '', - 'max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]), 0.001), "instance", "$1", "instance", "([^:.]*).*")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - '{{device}}({{ceph_daemon}})', - 0, 21, 11, 9 - ), - HostDetailsGraphPanel( - {}, - '$ceph_hosts Disk utilization', - 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', - 'connected', - 'percent', - '%Util', - 'label_replace(((irate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[5m]) * 100), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")', - '{{device}}({{ceph_daemon}})', - 12, 21, 11, 9 - ) - ]) -} -{ - "radosgw-sync-overview.json": - local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) = - graphPanelSchema({}, title, '', 'null as zero', true, formatY1, 'short', labelY1, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric, 1, 'time_series', '{{source_zone}}')]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RGW Sync Overview', '', 'rgw-sync-overview', 'now-1h', '15s', 16, ["overview"], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - 
) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'prometehus', 1, true, 1, '', '') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addPanels([ - RgwSyncOverviewPanel( - 'Replication (throughput) from Source Zone', - 'Bps', - null, - 'ceph_data_sync_from_zone_fetch_bytes_sum', - 0, 0, 8, 7 - ), - RgwSyncOverviewPanel( - 'Replication (objects) from Source Zone', - 'short', - 'Objects/s', - 'ceph_data_sync_from_zone_fetch_bytes_count', - 8, 0, 8, 7 - ), - RgwSyncOverviewPanel( - 'Polling Request Latency from Source Zone', - 'ms', - null, - 'ceph_data_sync_from_zone_poll_latency_sum', - 16, 0, 8, 7 - ), - RgwSyncOverviewPanel( - 'Unsuccessful Object Replications from Source Zone', - 'short', - 'Count/s', - 'ceph_data_sync_from_zone_fetch_errors', - 0, 7, 8, 7 - ) - ]) -} -{ - "radosgw-overview.json": - local RgwOverviewPanel(title, description, formatY1, formatY2, expr1, legendFormat1, x, y, w, h, datasource='$datasource', legend_alignAsTable=false, legend_avg=false, legend_min=false, legend_max=false, legend_current=false, legend_values=false) = - graphPanelSchema({}, title, description, 'null', false, formatY1, formatY2, null, null, 0, 1, datasource, legend_alignAsTable, legend_avg, legend_min, legend_max, legend_current, legend_values) - .addTargets( - [addTargetSchema(expr1, 1, 'time_series', legendFormat1)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RGW Overview', '', 'WAkugZpiz', 'now-1h', '15s', 16, ['overview'], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '') - ) - .addTemplate( - addTemplateSchema('code', '$datasource', 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)', 1, true, 1, 'HTTP Code', '') - ) - .addTemplate( - addTemplateSchema('ingress_service', '$datasource', 'label_values(haproxy_server_status, instance)', 1, true, 1, 'Ingress Service', '') - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addPanels([ - addRowSchema(false, true, 'RGW Overview - All Gateways') + {gridPos: {x: 0, y: 0, w: 24, h: 1}}, - RgwOverviewPanel( - 'Average GET/PUT Latencies', - '', - 's', - 'short', - 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', - 'GET AVG', - 0, 1, 8, 7 - ) - .addTargets( - [ - addTargetSchema( - 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata', - 1, - 'time_series', - 'PUT AVG' - )]), - RgwOverviewPanel( - 'Total Requests/sec by RGW Instance', - '', - 
'none', - 'short', - 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))', - '{{rgw_host}}', - 8, 1, 7, 7 - ), - RgwOverviewPanel( - 'GET Latencies by RGW Instance', - 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts', - 's', - 'short', - 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', - '{{rgw_host}}', - 15, 1, 6, 7 - ), - RgwOverviewPanel( - 'Bandwidth Consumed by Type', - 'Total bytes transferred in/out of all radosgw instances within the cluster', - 'bytes', - 'short', - 'sum(rate(ceph_rgw_get_b[30s]))', - 'GETs', - 0, 8, 8, 6 - ) - .addTargets( - [ - addTargetSchema( - 'sum(rate(ceph_rgw_put_b[30s]))', - 1, - 'time_series', - 'PUTs' - )]), - RgwOverviewPanel( - 'Bandwidth by RGW Instance', - 'Total bytes transferred in/out through get/put operations, by radosgw instance', - 'bytes', - 'short', - 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', - '{{rgw_host}}', - 8, 8, 7, 6 - ), - RgwOverviewPanel( - 'PUT Latencies by RGW Instance', - 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts', - 's', - 'short', - 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")', - '{{rgw_host}}', - 15, 8, 6, 6 - ), - addRowSchema(false, true, 'RGW Overview - HAProxy Metrics') + {gridPos: {x: 0, y: 12, w: 9, h: 12}}, - RgwOverviewPanel( - 'Total responses by HTTP code', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)', - 'Frontend {{ code }}', - 0, 12, 5, 12, - '$datasource', - true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema( - 'sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', - 1, - 'time_series', - 'Backend {{ code }}' - )]) - .addSeriesOverride([ - { "alias": "/.*Back.*/", - "transform": "negative-Y" }, - { "alias": "/.*1.*/" }, - { "alias": "/.*2.*/" }, - { "alias": "/.*3.*/" }, - { "alias": "/.*4.*/" }, - { "alias": "/.*5.*/" }, - { "alias": "/.*other.*/" } - ]), - RgwOverviewPanel( - 'Total requests / responses', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', - 'Requests', - 5, 12, 5, 12, - '$datasource', - true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'), - addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'), - addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 
'Backend redispatch'), - addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'), - addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'), - addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'), - ]) - .addSeriesOverride([ - { - "alias": "/.*Response.*/", - "transform": "negative-Y" - }, - { - "alias": "/.*Backend.*/", - "transform": "negative-Y" - } - ]), - RgwOverviewPanel( - 'Total number of connections', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', - 'Front', - 10, 12, 5, 12, - '$datasource', - true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'), - addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'), - ]) - .addSeriesOverride([ - { - "alias": "/.*Back.*/", - "transform": "negative-Y" - } - ]), - RgwOverviewPanel( - 'Current total of incoming / outgoing bytes', - '', - 'short', - 'short', - 'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 'IN Front', 15, 12, 6, 12, '$datasource', true, true, true, true, true, true) - .addTargets( - [ - addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'), - addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'), - addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back') - ]) - .addSeriesOverride([ - { - "alias": "/.*OUT.*/", - "transform": "negative-Y" - } - ]) - ]) -} -{ - "radosgw-detail.json": - local RgwDetailsPanel(aliasColors, title, description, formatY1, formatY2, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) = - graphPanelSchema(aliasColors, title, description, 'null', false, formatY1, formatY2, null, null, 0, 1, '$datasource') - .addTargets( - [addTargetSchema(expr1, 1, 'time_series', legendFormat1),addTargetSchema(expr2, 1, 'time_series', legendFormat2)]) + {gridPos: {x: x, y: y, w: w, h: h}}; - - dashboardSchema( - 'RGW Instance Detail', '', 'x5ARzZtmk', 'now-1h', '15s', 16, ['overview'], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']} - ) - .addAnnotation( - addAnnotationSchema( - 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard') - ) - .addRequired( - type='grafana', id='grafana', name='Grafana', version='5.0.0' - ) - .addRequired( - type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3' - ) - .addRequired( - type='panel', id='graph', name='Graph', version='5.0.0' - ) - .addTemplate( - g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') - ) - .addTemplate( - 
addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '')
-    )
-    .addPanels([
-      addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + {gridPos: {x: 0, y: 0, w: 24, h: 1}},
-      RgwDetailsPanel(
-        {},
-        '$rgw_servers GET/PUT Latencies',
-        '',
-        's',
-        'short',
-        'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-        'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-        'GET {{ceph_daemon}}',
-        'PUT {{ceph_daemon}}',
-        0, 1, 6, 8
-      ),
-      RgwDetailsPanel(
-        {},
-        'Bandwidth by HTTP Operation',
-        '',
-        'bytes',
-        'short',
-        'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-        'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-        'GETs {{ceph_daemon}}',
-        'PUTs {{ceph_daemon}}',
-        6, 1, 7, 8
-      ),
-      RgwDetailsPanel(
-        {"GETs": "#7eb26d","Other": "#447ebc","PUTs": "#eab839","Requests": "#3f2b5b","Requests Failed": "#bf1b00"},
-        'HTTP Request Breakdown',
-        '',
-        'short',
-        'short',
-        'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-        'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-        'Requests Failed {{ceph_daemon}}',
-        'GETs {{ceph_daemon}}',
-        13, 1, 7, 8
-      )
-      .addTargets(
-        [
-          addTargetSchema(
-            'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-            1,
-            'time_series',
-            'PUTs {{ceph_daemon}}'
-          ),
-          addTargetSchema(
-            '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
-            1,
-            'time_series',
-            'Other {{ceph_daemon}}'
-          )]),
-      addPieChartSchema(
-        {"GETs": "#7eb26d","Other (HEAD,POST,DELETE)": "#447ebc","PUTs": "#eab839","Requests": "#3f2b5b","Failures": "#bf1b00"},
-        '$datasource',
-        '',
-        'Under graph',
-        'pie',
-        'Workload Breakdown',
-        'current'
-      )
-      .addTarget(addTargetSchema('rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Failures {{ceph_daemon}}'))
-      .addTarget(addTargetSchema('rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'GETs {{ceph_daemon}}'))
-      .addTarget(addTargetSchema('rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'PUTs {{ceph_daemon}}'))
-      .addTarget(addTargetSchema('(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}', 1, 'time_series', 'Other (DELETE,LIST) {{ceph_daemon}}')) + {gridPos: {x: 20, y: 1, w: 4, h: 8}}
-    ])
-}
-{
-  "rbd-details.json":
-    local RbdDetailsPanel(title, formatY1, expr1, expr2, x, y, w, h) =
-      graphPanelSchema({}, title, '', 'null as zero', false, formatY1, formatY1, null, null, 0, 1, '$Datasource')
-      .addTargets(
-        [addTargetSchema(expr1, 1, 'time_series', 'Write'),addTargetSchema(expr2, 1, 'time_series', 'Read')]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'RBD Details', 'Detailed Performance of RBD Images (IOPS/Throughput/Latency)', 'YhCYGcuZz', 'now-1h', false, 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.3.3'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('Datasource', 'prometheus', 'default', label=null)
-    )
-    .addTemplate(
-      addTemplateSchema('Pool', '$Datasource', 'label_values(pool)', 1, false, 0, '', '')
-    )
-    .addTemplate(
-      addTemplateSchema('Image', '$Datasource', 'label_values(image)', 1, false, 0, '', '')
-    )
-    .addPanels([
-      RbdDetailsPanel(
-        'IOPS',
-        'iops',
-        'irate(ceph_rbd_write_ops{pool="$Pool", image="$Image"}[30s])',
-        'irate(ceph_rbd_read_ops{pool="$Pool", image="$Image"}[30s])',
-        0, 0, 8, 9
-      ),
-      RbdDetailsPanel(
-        'Throughput',
-        'Bps',
-        'irate(ceph_rbd_write_bytes{pool="$Pool", image="$Image"}[30s])',
-        'irate(ceph_rbd_read_bytes{pool="$Pool", image="$Image"}[30s])',
-        8, 0, 8, 9
-      ),
-      RbdDetailsPanel(
-        'Average Latency',
-        'ns',
-        'irate(ceph_rbd_write_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_write_latency_count{pool="$Pool", image="$Image"}[30s])',
-        'irate(ceph_rbd_read_latency_sum{pool="$Pool", image="$Image"}[30s]) / irate(ceph_rbd_read_latency_count{pool="$Pool", image="$Image"}[30s])',
-        16, 0, 8, 9
-      )
-    ])
-}
-{
-  "rbd-overview.json":
-    local RgwOverviewStyle(alias, pattern, type, unit) =
-      addStyle(alias, null, ["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
-    local RbdOverviewPanel(title, formatY1, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) =
-      graphPanelSchema({}, title, '', 'null', false, formatY1, 'short', null, null, 0, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr1, 1, 'time_series', legendFormat1),addTargetSchema(expr2, 1, 'time_series', legendFormat2)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'RBD Overview', '', '41FrpeUiz', 'now-1h', '30s', 16, ["overview"], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.4.2'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addRequired(
-      type='datasource', id='prometheus', name='Prometheus', version='5.0.0'
-    )
-    .addRequired(
-      type='panel', id='table', name='Table', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addPanels([
-      RbdOverviewPanel(
-        'IOPS',
-        'short',
-        'round(sum(irate(ceph_rbd_write_ops[30s])))',
-        'round(sum(irate(ceph_rbd_read_ops[30s])))',
-        'Writes',
-        'Reads',
-        0, 0, 8, 7
-      ),
-      RbdOverviewPanel(
-        'Throughput',
-        'Bps',
-        'round(sum(irate(ceph_rbd_write_bytes[30s])))',
-        'round(sum(irate(ceph_rbd_read_bytes[30s])))',
-        'Write',
-        'Read',
-        8, 0, 8, 7
-      ),
-      RbdOverviewPanel(
-        'Average Latency',
-        'ns',
-        'round(sum(irate(ceph_rbd_write_latency_sum[30s])) / sum(irate(ceph_rbd_write_latency_count[30s])))',
-        'round(sum(irate(ceph_rbd_read_latency_sum[30s])) / sum(irate(ceph_rbd_read_latency_count[30s])))',
-        'Write',
-        'Read',
-        16, 0, 8, 7
-      ),
-      addTableSchema(
-        '$datasource',
-        '',
-        {"col": 3,"desc": true},
-        [
-          RgwOverviewStyle('Pool', 'pool', 'string', 'short'),RgwOverviewStyle('Image', 'image', 'string', 'short'),RgwOverviewStyle('IOPS', 'Value', 'number', 'iops'), RgwOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest IOPS', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10, (sort((irate(ceph_rbd_write_ops[30s]) + on (image, pool, namespace) irate(ceph_rbd_read_ops[30s])))))',
-          1,
-          'table',
-          '')
-      ) + {gridPos: {x: 0, y: 7, w: 8, h: 7}},
-      addTableSchema(
-        '$datasource',
-        '',
-        {"col": 3,"desc": true},
-        [
-          RgwOverviewStyle('Pool', 'pool', 'string', 'short'),RgwOverviewStyle('Image', 'image', 'string', 'short'),RgwOverviewStyle('Throughput', 'Value', 'number', 'Bps'), RgwOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest Throughput', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10, sort(sum(irate(ceph_rbd_read_bytes[30s]) + irate(ceph_rbd_write_bytes[30s])) by (pool, image, namespace)))',
-          1,
-          'table',
-          ''
-        )
-      ) + {gridPos: {x: 8, y: 7, w: 8, h: 7}},
-      addTableSchema(
-        '$datasource',
-        '',
-        {"col": 3,"desc": true},
-        [
-          RgwOverviewStyle('Pool', 'pool', 'string', 'short'),RgwOverviewStyle('Image', 'image', 'string', 'short'),RgwOverviewStyle('Latency', 'Value', 'number', 'ns'), RgwOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest Latency', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10,\n sum(\n irate(ceph_rbd_write_latency_sum[30s]) / clamp_min(irate(ceph_rbd_write_latency_count[30s]), 1) +\n irate(ceph_rbd_read_latency_sum[30s]) / clamp_min(irate(ceph_rbd_read_latency_count[30s]), 1)\n ) by (pool, image, namespace)\n)',
-          1,
-          'table',
-          ''
-        )
-      ) + {gridPos: {x: 16, y: 7, w: 8, h: 7}}
-    ])
-}
-{
-  "pool-overview.json":
-    local PoolOverviewSingleStatPanel(format, title, description, valueName, expr, targetFormat, x, y, w, h) =
-      addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, false, 100, false, false, '')
-      .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    local PoolOverviewStyle(alias, pattern, type, unit, colorMode, thresholds, valueMaps) =
-      addStyle(alias, colorMode, ["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, thresholds, type, unit, valueMaps);
-
-    local PoolOverviewGraphPanel(title, description, formatY1, labelY1, expr, targetFormat, legendFormat, x, y, w, h) =
-      graphPanelSchema({}, title, description, 'null as zero', false, formatY1, 'short', labelY1, null, 0, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'Ceph Pools Overview', '', 'z99hzWtmk', 'now-1h', '15s', 22, [], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'Dashboard1', label='Data Source')
-    )
-    .addTemplate(
-      g.template.custom(label='TopK', name='topk', current='15', query='15')
-    )
-    .addPanels([
-      PoolOverviewSingleStatPanel(
-        'none',
-        'Pools',
-        '',
-        'avg',
-        'count(ceph_pool_metadata)',
-        'table',
-        0, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'none',
-        'Pools with Compression',
-        'Count of the pools that have compression enabled', 'current', 'count(ceph_pool_metadata{compression_mode!="none"})',
-        '',
-        3, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'bytes',
-        'Total Raw Capacity',
-        'Total raw capacity available to the cluster',
-        'current',
-        'sum(ceph_osd_stat_bytes)',
-        '',
-        6, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'bytes',
-        'Raw Capacity Consumed',
-        'Total raw capacity consumed by user data and associated overheads (metadata + redundancy)',
-        'current',
-        'sum(ceph_pool_bytes_used)',
-        '',
-        9, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'bytes',
-        'Logical Stored ',
-        'Total of client data stored in the cluster',
-        'current',
-        'sum(ceph_pool_stored)',
-        '',
-        12, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'bytes',
-        'Compression Savings',
-        'A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression',
-        'current',
-        'sum(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used)',
-        '',
-        15, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'percent',
-        'Compression Eligibility',
-        'Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data\n',
-        'current',
-        '(sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_stored_raw and ceph_pool_compress_under_bytes > 0)) * 100',
-        'table',
-        18, 0, 3, 3
-      ),
-      PoolOverviewSingleStatPanel(
-        'none',
-        'Compression Factor',
-        'This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)',
-        'current',
-        'sum(ceph_pool_compress_under_bytes > 0) / sum(ceph_pool_compress_bytes_used > 0)',
-        '',
-        21, 0, 3, 3
-      ),
-      addTableSchema(
-        '$datasource', '', {"col": 5,"desc": true}, [PoolOverviewStyle('', 'Time', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'instance', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'job', 'hidden', 'short', null, [], []),PoolOverviewStyle('Pool Name', 'name', 'string', 'short', null, [], []),PoolOverviewStyle('Pool ID', 'pool_id', 'hidden', 'none', null, [], []),PoolOverviewStyle('Compression Factor', 'Value #A', 'number', 'none', null, [], []),PoolOverviewStyle('% Used', 'Value #D', 'number', 'percentunit', 'value', ['70','85'], []),PoolOverviewStyle('Usable Free', 'Value #B', 'number', 'bytes', null, [], []),PoolOverviewStyle('Compression Eligibility', 'Value #C', 'number', 'percent', null, [], []),PoolOverviewStyle('Compression Savings', 'Value #E', 'number', 'bytes', null, [], []),PoolOverviewStyle('Growth (5d)', 'Value #F', 'number', 'bytes', 'value', ['0', '0'], []),PoolOverviewStyle('IOPS', 'Value #G', 'number', 'none', null, [], []),PoolOverviewStyle('Bandwidth', 'Value #H', 'number', 'Bps', null, [], []),PoolOverviewStyle('', '__name__', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'type', 'hidden', 'short', null, [], []),PoolOverviewStyle('', 'compression_mode', 'hidden', 'short', null, [], []),PoolOverviewStyle('Type', 'description', 'string', 'short', null, [], []),PoolOverviewStyle('Stored', 'Value #J', 'number', 'bytes', null, [], []),PoolOverviewStyle('', 'Value #I', 'hidden', 'short', null, [], []),PoolOverviewStyle('Compression', 'Value #K', 'string', 'short', null, [], [{"text": "ON","value": "1"}])], 'Pool Overview', 'table'
-      )
-      .addTargets(
-        [addTargetSchema('(ceph_pool_compress_under_bytes / ceph_pool_compress_bytes_used > 0) and on(pool_id) (((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100 > 0.5)', 1, 'table', ''),
-        addTargetSchema('ceph_pool_max_avail * on(pool_id) group_left(name) ceph_pool_metadata', 1, 'table', ''),
-        addTargetSchema('((ceph_pool_compress_under_bytes > 0) / ceph_pool_stored_raw) * 100', 1, 'table', ''),
-        addTargetSchema('(ceph_pool_percent_used * on(pool_id) group_left(name) ceph_pool_metadata)', 1, 'table', ''),
-        addTargetSchema('(ceph_pool_compress_under_bytes - ceph_pool_compress_bytes_used > 0)', 1, 'table', ''),
-        addTargetSchema('delta(ceph_pool_stored[5d])', 1, 'table', ''),
-        addTargetSchema('rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])', 1, 'table', ''),
-        addTargetSchema('rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])', 1, 'table', ''),
-        addTargetSchema('ceph_pool_metadata', 1, 'table', ''),
-        addTargetSchema('ceph_pool_stored * on(pool_id) group_left ceph_pool_metadata', 1, 'table', ''),
-        addTargetSchema('ceph_pool_metadata{compression_mode!="none"}', 1, 'table', ''),
-        addTargetSchema('', '', '', '')]
-      ) + {gridPos: {x: 0, y: 3, w: 24, h: 6}},
-      PoolOverviewGraphPanel(
-        'Top $topk Client IOPS by Pool',
-        'This chart shows the sum of read and write IOPS from all clients by pool',
-        'short',
-        'IOPS',
-        'topk($topk,round((rate(ceph_pool_rd[30s]) + rate(ceph_pool_wr[30s])),1) * on(pool_id) group_left(instance,name) ceph_pool_metadata) ',
-        'time_series',
-        '{{name}} ',
-        0, 9, 12, 8
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk($topk,rate(ceph_pool_wr[30s]) + on(pool_id) group_left(instance,name) ceph_pool_metadata) ',
-          1,
-          'time_series',
-          '{{name}} - write')
-      ),
-      PoolOverviewGraphPanel(
-        'Top $topk Client Bandwidth by Pool',
-        'The chart shows the sum of read and write bytes from all clients, by pool',
-        'Bps',
-        'Throughput',
-        'topk($topk,(rate(ceph_pool_rd_bytes[30s]) + rate(ceph_pool_wr_bytes[30s])) * on(pool_id) group_left(instance,name) ceph_pool_metadata)',
-        'time_series',
-        '{{name}}',
-        12, 9, 12, 8
-      ),
-      PoolOverviewGraphPanel(
-        'Pool Capacity Usage (RAW)',
-        'Historical view of capacity usage, to help identify growth and trends in pool consumption',
-        'bytes',
-        'Capacity Used',
-        'ceph_pool_bytes_used * on(pool_id) group_right ceph_pool_metadata',
-        '',
-        '{{name}}',
-        0, 17, 24, 7
-      )
-    ])
-}
-{
-  "pool-detail.json":
-    local PoolDetailSingleStatPanel(format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds, expr, targetFormat, x, y, w, h) =
-      addSingelStatSchema(['#299c46','rgba(237, 129, 40, 0.89)','#d44a3a'], '$datasource', format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds)
-      .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    local PoolDetailGraphPanel(alias, title, description, formatY1, labelY1, expr, targetFormat, legendFormat, x, y, w, h) =
-      graphPanelSchema(alias, title, description, 'null as zero', false, formatY1, 'short', labelY1, null, null, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'Ceph Pool Details', '', '-xyV8KCiz', 'now-1h', '15s', 22, [], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.3.2'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addRequired(
-      type='panel', id='singlestat', name='Singlestat', version='5.0.0'
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'Prometheus admin.virt1.home.fajerski.name:9090', label='Data Source')
-    )
-    .addTemplate(
-      addTemplateSchema('pool_name', '$datasource', 'label_values(ceph_pool_metadata,name)', 1, false, 1, 'Pool Name', '')
-    )
-    .addPanels([
-      PoolDetailSingleStatPanel(
-        'percentunit',
-        'Capacity used',
-        '',
-        'current',
-        true, 1, true, true,
-        '.7,.8',
-        '(ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        0, 0, 7, 7
-      ),
-      PoolDetailSingleStatPanel(
-        's',
-        'Time till full',
-        'Time till pool is full assuming the average fill rate of the last 6 hours',
-        false, 100, false, false,
-        '',
-        'current',
-        '(ceph_pool_max_avail / deriv(ceph_pool_stored[6h])) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"} > 0',
-        'time_series',
-        7, 0, 5, 7
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Object Ingress/Egress',
-        '',
-        'ops',
-        'Objects out(-) / in(+) ',
-        'deriv(ceph_pool_objects[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'Objects per second',
-        12, 0, 12, 7
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Client IOPS',
-        '',
-        'iops',
-        'Read (-) / Write (+)',
-        'irate(ceph_pool_rd[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'reads',
-        0, 7, 12, 7
-      )
-      .addSeriesOverride({"alias": "reads","transform": "negative-Y"})
-      .addTarget(
-        addTargetSchema(
-          'irate(ceph_pool_wr[1m]) * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-          1,
-          'time_series',
-          'writes'
-        )
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Client Throughput',
-        '',
-        'Bps', 'Read (-) / Write (+)',
-        'irate(ceph_pool_rd_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'reads',
-        12, 7, 12, 7
-      )
-      .addSeriesOverride({"alias": "reads","transform": "negative-Y"})
-      .addTarget(
-        addTargetSchema(
-          'irate(ceph_pool_wr_bytes[1m]) + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-          1,
-          'time_series',
-          'writes'
-        )
-      ),
-      PoolDetailGraphPanel(
-        {"read_op_per_sec": "#3F6833","write_op_per_sec": "#E5AC0E"},
-        '$pool_name Objects',
-        '',
-        'short',
-        'Objects',
-        'ceph_pool_objects * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~"$pool_name"}',
-        'time_series',
-        'Number of Objects',
-        0, 14, 12, 7
-      )
-    ])
-}
-{
-  "osds-overview.json":
-    local OsdOverviewStyle(alias, pattern, type, unit) =
-      addStyle(alias, null, ["rgba(245, 54, 54, 0.9)","rgba(237, 129, 40, 0.89)","rgba(50, 172, 45, 0.97)"], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
-    local OsdOverviewGraphPanel(alias, title, description, formatY1, labelY1, min, expr, legendFormat1, x, y, w, h) =
-      graphPanelSchema(alias, title, description, 'null', false, formatY1, 'short', labelY1, null, min, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr, 1, 'time_series', legendFormat1)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-    local OsdOverviewPieChartPanel(alias, description, title) =
-      addPieChartSchema(alias, '$datasource', description, 'Under graph', 'pie', title, 'current');
-    local OsdOverviewSingleStatPanel(colors, format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds, expr, targetFormat, x, y, w, h) =
-      addSingelStatSchema(colors, '$datasource', format, title, description, valueName, colorValue, gaugeMaxValue, gaugeShow, sparkLineShow, thresholds)
-      .addTarget(addTargetSchema(expr, 1, targetFormat, '')) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'OSD Overview', '', 'lo02I1Aiz', 'now-1h', '10s', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.0.0'
-    )
-    .addRequired(
-      type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addRequired(
-      type='panel', id='table', name='Table', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addPanels([
-      OsdOverviewGraphPanel(
-        {"@95%ile": "#e0752d"},
-        'OSD Read Latencies',
-        '',
-        'ms',
-        null,
-        '0',
-        'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
-        'AVG read',
-        0, 0, 8, 8
-      )
-      .addTargets(
-        [
-          addTargetSchema(
-            'max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
-            1,
-            'time_series',
-            'MAX read'),
-          addTargetSchema(
-            'quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)',
-            1,
-            'time_series',
-            '@95%ile'
-          )],
-      ),
-      addTableSchema(
-        '$datasource',
-        'This table shows the osd\'s that are delivering the 10 highest read latencies within the cluster',
-        {"col": 2,"desc": true},
-        [OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),OsdOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest READ Latencies', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n',
-          1,
-          'table',
-          ''
-        )
-      ) + {gridPos: {x: 8, y: 0, w: 4, h: 8}},
-      OsdOverviewGraphPanel(
-        {"@95%ile write": "#e0752d"},
-        'OSD Write Latencies',
-        '',
-        'ms',
-        null,
-        '0',
-        'avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
-        'AVG write',
-        12, 0, 8, 8
-      )
-      .addTargets(
-        [
-          addTargetSchema(
-            'max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
-            1,
-            'time_series',
-            'MAX write'
-          ),
-          addTargetSchema(
-            'quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)',
-            1,
-            'time_series',
-            '@95%ile write'
-          )],
-      ),
-      addTableSchema(
-        '$datasource',
-        'This table shows the osd\'s that are delivering the 10 highest write latencies within the cluster',
-        {"col": 2,"desc": true},
-        [OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),OsdOverviewStyle('', '/.*/', 'hidden', 'short')], 'Highest WRITE Latencies', 'table'
-      )
-      .addTarget(
-        addTargetSchema(
-          'topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n',
-          1,
-          'table',
-          ''
-        )
-      ) + {gridPos: {x: 20, y: 0, w: 4, h: 8}},
-      OsdOverviewPieChartPanel(
-        {},
-        '',
-        'OSD Types Summary'
-      )
-      .addTarget(addTargetSchema('count by (device_class) (ceph_osd_metadata)', 1, 'time_series', '{{device_class}}')) + {gridPos: {x: 0, y: 8, w: 4, h: 8}},
-      OsdOverviewPieChartPanel(
-        {"Non-Encrypted": "#E5AC0E"},
-        '',
-        'OSD Objectstore Types'
-      )
-      .addTarget(addTargetSchema('count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'bluestore'))
-      .addTarget(addTargetSchema('count(ceph_osd_metadata) - count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'filestore'))
-      .addTarget(addTargetSchema('absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 1, 'time_series', 'filestore')) + {gridPos: {x: 4, y: 8, w: 4, h: 8}},
-      OsdOverviewPieChartPanel(
-        {},
-        'The pie chart shows the various OSD sizes used within the cluster',
-        'OSD Size Summary'
-      )
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes < 1099511627776)', 1, 'time_series', '<1TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', 1, 'time_series', '<2TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', 1, 'time_series', '<3TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', 1, 'time_series', '<4TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', 1, 'time_series', '<6TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', 1, 'time_series', '<8TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', 1, 'time_series', '<10TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', 1, 'time_series', '<12TB'))
-      .addTarget(addTargetSchema('count(ceph_osd_stat_bytes >= 13194139533312)', 1, 'time_series', '<12TB+')) + {gridPos: {x: 8, y: 8, w: 4, h: 8}},
-      g.graphPanel.new(bars=true, datasource='$datasource', title='Distribution of PGs per OSD', x_axis_buckets=20, x_axis_mode='histogram', x_axis_values=['total'], formatY1='short', formatY2='short', labelY1='# of OSDs', min='0', nullPointMode='null')
-      .addTarget(addTargetSchema('ceph_osd_numpg\n', 1, 'time_series', 'PGs per OSD')) + {gridPos: {x: 12, y: 8, w: 8, h: 8}},
-      OsdOverviewSingleStatPanel(
-        ['#d44a3a', '#299c46'],
-        'percentunit',
-        'OSD onode Hits Ratio',
-        'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
-        'current',
-        true, 1, true, false,
-        '.75',
-        'sum(ceph_bluestore_onode_hits)/(sum(ceph_bluestore_onode_hits) + sum(ceph_bluestore_onode_misses))',
-        'time_series',
-        20, 8, 4, 8
-      ),
-      addRowSchema(false, true, 'R/W Profile') + {gridPos: {x: 0, y: 16, w: 24, h: 1}},
-      OsdOverviewGraphPanel(
-        {},
-        'Read/Write Profile',
-        'Show the read/write workload profile overtime',
-        'short',
-        null,
-        null,
-        'round(sum(irate(ceph_pool_rd[30s])))',
-        'Reads',
-        0, 17, 24, 8
-      )
-      .addTargets([addTargetSchema('round(sum(irate(ceph_pool_wr[30s])))', 1, 'time_series', 'Writes')])
-    ])
-}
-{
-  "osd-device-details.json":
-    local OsdDeviceDetailsPanel(title, description, formatY1, labelY1, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) =
-      graphPanelSchema({}, title, description, 'null', false, formatY1, 'short', labelY1, null, null, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr1, 1, 'time_series', legendFormat1),addTargetSchema(expr2, 1, 'time_series', legendFormat2)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'OSD device details', '', 'CrAHE0iZz', 'now-3h', '', 16, [], '', {refresh_intervals:['5s','10s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.3.2'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addTemplate(
-      addTemplateSchema('osd', '$datasource', 'label_values(ceph_osd_metadata,ceph_daemon)', 1, false, 1, 'OSD', '(.*)')
-    )
-    .addPanels([
-      addRowSchema(false, true, 'OSD Performance') + {gridPos: {x: 0, y: 0, w: 24, h: 1}},
-      OsdDeviceDetailsPanel(
-        '$osd Latency',
-        '',
-        's',
-        'Read (-) / Write (+)',
-        'irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])',
-        'irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])',
-        'read',
-        'write',
-        0, 1, 6, 9
-      )
-      .addSeriesOverride({"alias": "read","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        '$osd R/W IOPS',
-        '',
-        'short',
-        'Read (-) / Write (+)',
-        'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])',
-        'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])',
-        'Reads',
-        'Writes',
-        6, 1, 6, 9
-      )
-      .addSeriesOverride({"alias": "Reads","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        '$osd R/W Bytes',
-        '',
-        'bytes',
-        'Read (-) / Write (+)',
-        'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])',
-        'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])',
-        'Read Bytes',
-        'Write Bytes',
-        12, 1, 6, 9
-      )
-      .addSeriesOverride({"alias": "Read Bytes","transform": "negative-Y"}),
-      addRowSchema(false, true, 'Physical Device Performance') + {gridPos: {x: 0, y: 10, w: 24, h: 1}},
-      OsdDeviceDetailsPanel(
-        'Physical Device Latency for $osd',
-        '',
-        's',
-        'Read (-) / Write (+)',
-        '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
-        '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
-        '{{instance}}/{{device}} Reads',
-        '{{instance}}/{{device}} Writes',
-        0, 11, 6, 9
-      )
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        'Physical Device R/W IOPS for $osd',
-        '',
-        'short',
-        'Read (-) / Write (+)',
-        'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        '{{device}} on {{instance}} Writes',
-        '{{device}} on {{instance}} Reads',
-        6, 11, 6, 9
-      )
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      OsdDeviceDetailsPanel(
-        'Physical Device R/W Bytes for $osd',
-        '',
-        'Bps',
-        'Read (-) / Write (+)',
-        'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-        '{{instance}} {{device}} Reads',
-        '{{instance}} {{device}} Writes',
-        12, 11, 6, 9
-      )
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      graphPanelSchema(
-        {},
-        'Physical Device Util% for $osd',
-        '',
-        'null',
-        false,
-        'percentunit',
-        'short',
-        null, null, null,
-        1,
-        '$datasource'
-      )
-      .addTarget(
-        addTargetSchema(
-          'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
-          1,
-          'time_series',
-          '{{device}} on {{instance}}'
-        )) + {gridPos: {x: 18, y: 11, w: 6, h: 9}},
-    ])
-}
-{
-  "cephfs-overview.json":
-    local CephfsOverviewGraphPanel(title, formatY1, labelY1, expr, legendFormat, x, y, w, h) =
-      graphPanelSchema({}, title, '', 'null', false, formatY1, 'short', labelY1, null, 0, 1, '$datasource')
-      .addTargets(
-        [addTargetSchema(expr, 1, 'time_series', legendFormat)]) + {gridPos: {x: x, y: y, w: w, h: h}};
-
-    dashboardSchema(
-      'MDS Performance', '', 'tbO9LAiZz', 'now-1h', '15s', 16, [], '', {refresh_intervals:['5s','10s','15s','30s','1m','5m','15m','30m','1h','2h','1d'],time_options:['5m','15m','1h','6h','12h','24h','2d','7d','30d']}
-    )
-    .addAnnotation(
-      addAnnotationSchema(
-        1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
-    )
-    .addRequired(
-      type='grafana', id='grafana', name='Grafana', version='5.3.2'
-    )
-    .addRequired(
-      type='panel', id='graph', name='Graph', version='5.0.0'
-    )
-    .addTemplate(
-      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
-    )
-    .addTemplate(
-      addTemplateSchema('mds_servers', '$datasource', 'label_values(ceph_mds_inodes, ceph_daemon)', 1, true, 1, 'MDS Server', '')
-    )
-    .addPanels([
-      addRowSchema(false, true, 'MDS Performance') + {gridPos: {x: 0, y: 0, w: 24, h: 1}},
-      CephfsOverviewGraphPanel(
-        'MDS Workload - $mds_servers',
-        'none',
-        'Reads(-) / Writes (+)',
-        'sum(rate(ceph_objecter_op_r{ceph_daemon=~"($mds_servers).*"}[1m]))',
-        'Read Ops',
-        0, 1, 12, 9
-      )
-      .addTarget(addTargetSchema('sum(rate(ceph_objecter_op_w{ceph_daemon=~"($mds_servers).*"}[1m]))', 1, 'time_series', 'Write Ops'))
-      .addSeriesOverride({"alias": "/.*Reads/","transform": "negative-Y"}
-      ),
-      CephfsOverviewGraphPanel(
-        'Client Request Load - $mds_servers',
-        'none',
-        'Client Requests',
-        'ceph_mds_server_handle_client_request{ceph_daemon=~"($mds_servers).*"}',
-        '{{ceph_daemon}}',
-        12, 1, 12, 9
-      )
-    ])
-}
diff --git a/monitoring/grafana/dashboards/test-jsonnet.sh b/monitoring/grafana/dashboards/test-jsonnet.sh
deleted file mode 100644
index 127992c057554..0000000000000
--- a/monitoring/grafana/dashboards/test-jsonnet.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-TEMPDIR=`mktemp -d`
-BASEDIR=$(dirname "$0")
-
-JSONNET_PATH="${GRAFONNET_PATH}" jsonnet -m ${TEMPDIR} $BASEDIR/jsonnet/grafana_dashboards.jsonnet
-
-truncate -s 0 ${TEMPDIR}/json_difference.log
-for json_files in $BASEDIR/*.json
-do
-  JSON_FILE_NAME=$(basename $json_files)
-  for generated_files in ${TEMPDIR}/*.json
-  do
-    GENERATED_FILE_NAME=$(basename $generated_files)
-    if [ $JSON_FILE_NAME == $GENERATED_FILE_NAME ]; then
-      jsondiff --indent 2 $generated_files $json_files | tee -a ${TEMPDIR}/json_difference.log
-    fi
-  done
-done
-
-if [[ $(wc -l < ${TEMPDIR}/json_difference.log) -eq 0 ]]
-then
-  rm -rf ${TEMPDIR}
-  echo "Congratulations! Grafonnet Check Passed"
-else
-  rm -rf ${TEMPDIR}
-  echo "Grafonnet Check Failed, failed comparing generated file with existing"
-  exit 1
-fi
diff --git a/monitoring/grafana/dashboards/tox.ini b/monitoring/grafana/dashboards/tox.ini
deleted file mode 100644
index 382952c5b1bee..0000000000000
--- a/monitoring/grafana/dashboards/tox.ini
+++ /dev/null
@@ -1,44 +0,0 @@
-[tox]
-envlist = grafonnet-{check,fix},lint,promql-query-test
-skipsdist = true
-
-[grafonnet]
-deps =
-    -rrequirements-grafonnet.txt
-
-[testenv:grafonnet-{check,fix}]
-basepython = python3
-whitelist_externals =
-    jsonnet
-    bash
-description =
-    check: Ensure that auto-generated grafana dashboard files matches the current version
-    fix: generate dashboard json files from jsonnet file with latest changes
-deps =
-    {[grafonnet]deps}
-passenv = GRAFONNET_PATH
-commands =
-    check: bash test-jsonnet.sh
-    fix: jsonnet -m . jsonnet/grafana_dashboards.jsonnet
-
-
-[testenv:lint]
-description =
-    Run linters
-deps =
-    -rrequirements-lint.txt
-setenv =
-commands =
-    pylint --rcfile=.pylintrc tests
-    mypy tests
-    isort tests
-
-[testenv:promql-query-test]
-description =
-    Run promtool unit testing on grafana queries.
-deps =
-    -rtests/requirements.txt
-depends = grafonnet-check
-setenv =
-commands =
-    behave tests/features
diff --git a/monitoring/prometheus/CMakeLists.txt b/monitoring/prometheus/CMakeLists.txt
deleted file mode 100644
index 88c05163602af..0000000000000
--- a/monitoring/prometheus/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_subdirectory(tests)
diff --git a/monitoring/prometheus/README.md b/monitoring/prometheus/README.md
deleted file mode 100644
index fde63a35fe2ee..0000000000000
--- a/monitoring/prometheus/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-## Prometheus related bits
-
-### Alerts
-In monitoring/prometheus/alerts you'll find a set of Prometheus alert rules that
-should provide a decent set of default alerts for a Ceph cluster. Just put this
-file in a place according to your Prometheus configuration (wherever the `rules`
-configuration stanza points).
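With the standalone README gone, the deleted guidance still applies to the relocated rules: they can be dropped wherever the Prometheus `rules`/`rule_files` configuration points. For orientation, monitoring mixins conventionally expose their alert rules as a `prometheusAlerts` field on an importable object, so a tiny helper in the style below can materialise them back into YAML. This is an illustrative sketch only, not something added by this patch; the file name render_alerts.jsonnet and the exact entry point are assumptions based on the conventional mixin layout:

    // render_alerts.jsonnet (hypothetical helper, not part of this patch):
    // emit the mixin's alert rules as a YAML document that Prometheus can
    // load from its rule_files configuration.
    local mixin = import 'mixin.libsonnet';
    {
      'prometheus_alerts.yaml': std.manifestYamlDoc(mixin.prometheusAlerts),
    }

Rendered with `jsonnet -S -m . render_alerts.jsonnet`, which writes each top-level field to its own file as a raw string.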
diff --git a/monitoring/prometheus/tests/CMakeLists.txt b/monitoring/prometheus/tests/CMakeLists.txt
deleted file mode 100644
index 15fce8e1e02b7..0000000000000
--- a/monitoring/prometheus/tests/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-if(WITH_TESTS)
-  include(AddCephTest)
-  # add_tox_test(prometheus-alerts ${CMAKE_CURRENT_SOURCE_DIR} TOX_ENVS py3)
-endif()
diff --git a/monitoring/prometheus/tests/settings.py b/monitoring/prometheus/tests/settings.py
deleted file mode 100644
index c54f141a3edf6..0000000000000
--- a/monitoring/prometheus/tests/settings.py
+++ /dev/null
@@ -1,2 +0,0 @@
-ALERTS_FILE = '../alerts/ceph_default_alerts.yml'
-UNIT_TESTS_FILE = 'test_alerts.yml'
\ No newline at end of file
diff --git a/monitoring/prometheus/tests/tox.ini b/monitoring/prometheus/tests/tox.ini
deleted file mode 100644
index b96390160b70f..0000000000000
--- a/monitoring/prometheus/tests/tox.ini
+++ /dev/null
@@ -1,11 +0,0 @@
-[tox]
-envlist = py3
-skipsdist = true
-
-[testenv]
-deps =
-    -rrequirements.txt
-    pytest
-commands =
-    pytest -rA test_syntax.py test_unittests.py
-    ./validate_rules.py
diff --git a/src/pybind/mgr/dashboard/grafana.py b/src/pybind/mgr/dashboard/grafana.py
index 2ba28fbeb6f4c..2e34187bf3f5d 100644
--- a/src/pybind/mgr/dashboard/grafana.py
+++ b/src/pybind/mgr/dashboard/grafana.py
@@ -105,7 +105,7 @@ def load_local_dashboards():
     if os.environ.get('CEPH_DEV') == '1' or 'UNITTEST' in os.environ:
         path = os.path.abspath(os.path.join(
             os.path.dirname(__file__),
-            '../../../../monitoring/grafana/dashboards/'
+            '../../../../monitoring/ceph-mixin/dashboards_out/'
         ))
     else:
         path = '/etc/grafana/dashboards/ceph-dashboard'
diff --git a/src/pybind/mgr/dashboard/tox.ini b/src/pybind/mgr/dashboard/tox.ini
index 72296218c9bb8..4ba95ac24d7cf 100644
--- a/src/pybind/mgr/dashboard/tox.ini
+++ b/src/pybind/mgr/dashboard/tox.ini
@@ -153,7 +153,7 @@ commands =
 
 [testenv:check]
 commands =
-  python ci/check_grafana_dashboards.py frontend/src/app ../../../../monitoring/grafana/dashboards
+  python ci/check_grafana_dashboards.py frontend/src/app ../../../../monitoring/ceph-mixin/dashboards_out
 
 [testenv:openapi-{check,fix}]
 basepython = python3
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index a82423ddab283..bf49847937dbb 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -591,24 +591,6 @@ add_ceph_test(run-cli-tests ${CMAKE_CURRENT_SOURCE_DIR}/run-cli-tests)
 
 add_ceph_test(smoke.sh ${CMAKE_CURRENT_SOURCE_DIR}/smoke.sh)
 
-find_program(PROMTOOL_EXECUTABLE promtool)
-if(PROMTOOL_EXECUTABLE)
-  execute_process(
-    COMMAND ${PROMTOOL_EXECUTABLE} test rules /dev/null
-    RESULT_VARIABLE rc
-    OUTPUT_QUIET)
-  if(NOT rc)
-    add_ceph_test(run-promtool-unittests
-      ${PROMTOOL_EXECUTABLE} test rules ${CMAKE_SOURCE_DIR}/monitoring/prometheus/tests/test_alerts.yml)
-  elseif(NOT promtool_executable_checked)
-    message(WARNING "'${PROMTOOL_EXECUTABLE} test rules' does not work, "
-      "please use a newer prometheus")
-  endif()
-else()
-  add_ceph_test(run-promtool-unittests
-    {CMAKE_CURRENT_SOURCE_DIR}/run-promtool-unittests.sh)
-endif()
-
 set_property(
   TEST ${tox_tests}
   PROPERTY ENVIRONMENT ${env_vars_for_tox_tests})
-- 
2.39.5
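For reviewers who want to reproduce the generated dashboards_out files this patch points the dashboard module at: under a mixin layout the dashboards are plain jsonnet values, so a multi-file render is enough. A minimal sketch, assuming the conventional `grafanaDashboards` mixin field and a vendored grafonnet library (the helper file name is illustrative, not part of this patch):

    // render_dashboards.jsonnet (illustrative only):
    // emit one JSON file per dashboard, mirroring what the deleted
    // test-jsonnet.sh achieved for the old monitoring/grafana/dashboards
    // layout with `jsonnet -m`.
    local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
    {
      [name]: dashboards[name]
      for name in std.objectFields(dashboards)
    }

Running `jsonnet -J vendor -m dashboards_out render_dashboards.jsonnet` and diffing the result against the committed dashboards_out/*.json gives the same drift check the old script performed with jsondiff.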
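A note on the recurring query shape in the deleted OSD panels above: average latency is derived by dividing the rate of a `_sum` counter by the rate of the matching `_count` counter, scaled to milliseconds. As a hedged sketch of how that repetition could be factored out in jsonnet (helper name and file are hypothetical; the upstream mixin may structure this differently):

    // latency_queries.jsonnet (illustrative only):
    // build the '<agg> (irate(..._sum) / irate(..._count) * 1000)' PromQL
    // string that the OSD latency panels repeat for avg/max/quantile.
    local latencyMs(agg, metric) =
      '%s (irate(%s_sum[1m]) / on (ceph_daemon) irate(%s_count[1m]) * 1000)'
      % [agg, metric, metric];
    {
      avg_read: latencyMs('avg', 'ceph_osd_op_r_latency'),
      max_write: latencyMs('max', 'ceph_osd_op_w_latency'),
    }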