From: Pere Diaz Bou
Date: Fri, 11 Nov 2022 09:43:01 +0000 (+0100)
Subject: mgr/prometheus: expose daemon health metrics
X-Git-Tag: v17.2.6~264^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6a851ba3beee3cca058fc3a0932d2b4878ff335d;p=ceph.git

mgr/prometheus: expose daemon health metrics

Until now, daemon health metrics were stored but never used. One of the
most helpful of these metrics is SLOW_OPS for OSDs and MONs. This commit
exposes it, providing fine-grained, per-daemon metrics that make it easier
to find troublesome OSDs, instead of relying on a single cluster-wide
slow-ops health check.

Signed-off-by: Pere Diaz Bou
(cherry picked from commit 5a2b7c25b68f2c955356640041e4c7ed72416d4e)
---

diff --git a/doc/mgr/modules.rst b/doc/mgr/modules.rst
index 454839e2abd4..667664139739 100644
--- a/doc/mgr/modules.rst
+++ b/doc/mgr/modules.rst
@@ -508,6 +508,7 @@ function. This will result in a circular locking exception.
 .. automethod:: MgrModule.get_perf_schema
 .. automethod:: MgrModule.get_counter
 .. automethod:: MgrModule.get_mgr_id
+.. automethod:: MgrModule.get_daemon_health_metrics
 
 Exposing health checks
 ----------------------
diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet
index 3e0b31f2c459..1c66120af9c2 100644
--- a/monitoring/ceph-mixin/dashboards/host.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/host.libsonnet
@@ -719,5 +719,30 @@ local g = import 'grafonnet/grafana.libsonnet';
       11,
       9
     ),
+    $.addTableSchema(
+      '$datasource',
+      'This table shows the 10 hosts with the highest number of slow ops',
+      { col: 2, desc: true },
+      [
+        $.overviewStyle('Instance', 'instance', 'string', 'short'),
+        $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+        $.overviewStyle('', '/.*/', 'hidden', 'short'),
+      ],
+      'Top Slow Ops per Host',
+      'table'
+    )
+    .addTarget(
+      $.addTargetSchema(
+        |||
+          topk(10,
+            (sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
+          )
+        ||| % $.matchers(),
+        '',
+        'table',
+        1,
+        true
+      )
+    ) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
   ]),
 }
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
index 129b74ba6669..0ea43c96ff9f 100644
--- a/monitoring/ceph-mixin/dashboards/osd.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -300,6 +300,31 @@ local g = import 'grafonnet/grafana.libsonnet';
       .addTargets([$.addTargetSchema(
         'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
       )]),
+    $.addTableSchema(
+      '$datasource',
+      'This table shows the 10 OSDs with the highest number of slow ops',
+      { col: 2, desc: true },
+      [
+        $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+        $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+        $.overviewStyle('', '/.*/', 'hidden', 'short'),
+      ],
+      'Top Slow Ops',
+      'table'
+    )
+    .addTarget(
+      $.addTargetSchema(
+        |||
+          topk(10,
+            (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
+          )
+        ||| % $.matchers(),
+        '',
+        'table',
+        1,
+        true
+      )
+    ) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
   ]),
   'osd-device-details.json':
     local OsdDeviceDetailsPanel(title,
diff --git a/monitoring/ceph-mixin/dashboards_out/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json
index 93c51f009410..b8e08697a98b 100644
--- a/monitoring/ceph-mixin/dashboards_out/host-details.json
+++ b/monitoring/ceph-mixin/dashboards_out/host-details.json
@@ -1119,6 +1119,91 @@
           "show": true
         }
       ]
+    },
+    {
+
"columns": [ ], + "datasource": "$datasource", + "description": "This table shows the 10 hosts with the highest number of slow ops", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 40 + }, + "id": 15, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "Instance", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Slow Ops", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops per Host", + "transform": "table", + "type": "table" } ], "refresh": "30s", diff --git a/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json index 5ea8955b2974..b34c6642263d 100644 --- a/monitoring/ceph-mixin/dashboards_out/osds-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json @@ -860,6 +860,91 @@ "show": true } ] + }, + { + "columns": [ ], + "datasource": "$datasource", + "description": "This table shows the 10 OSDs with the highest number of slow ops", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 20 + }, + "id": 13, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "OSD ID", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "ceph_daemon", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Slow Ops", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + 
"timeShift": null, + "title": "Top Slow Ops", + "transform": "table", + "type": "table" } ], "refresh": "30s", diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index bed89a879064..7977e4035ecb 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -629,6 +629,17 @@ description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)', }, }, + { + alert: 'CephDaemonSlowOps', + 'for': '30s', + expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops', + summary: '{{ $labels.ceph_daemon }} operations are slow to complete', + description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)', + }, + }, ], }, { diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index bd773f27c540..33a5c5059b89 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -563,6 +563,16 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "CephDaemonSlowOps" + for: "30s" + expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" + labels: + severity: 'warning' + type: 'ceph_default' + annotations: + summary: "{{ $labels.ceph_daemon }} operations are slow to complete" + description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" - name: "cephadm" rules: - alert: "CephadmUpgradeFailed" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 8cdb56349360..d68b43badf2b 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -679,6 +679,33 @@ tests: summary: OSD operations are slow to complete description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)" + # slow daemon ops + - interval : 1m + input_series: + - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}' + values: '1+0x120' + promql_expr_test: + - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0' + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283", + job="ceph", type="SLOW_OPS"}' + value: 1 + alert_rule_test: + - eval_time: 20m + alertname: CephDaemonSlowOps + exp_alerts: + - exp_labels: + instance: ceph:9283 + ceph_daemon: "osd.1" + job: ceph + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: osd.1 operations are slow to complete + description: "osd.1 operations are taking too long to process (complaint time exceeded)" + # CEPHADM orchestrator alert triggers - interval: 30s input_series: diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 58c3d9ee4d6f..8c070dd8cbf4 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -1515,3 +1515,26 @@ void ActivePyModules::unregister_client(std::string_view name, std::string addrs dout(7) << "unregistering msgr client handle " << addrv << dendl; 
 	py_module_registry.unregister_client(name, addrv);
 }
+
+PyObject* ActivePyModules::get_daemon_health_metrics()
+{
+  without_gil_t no_gil;
+  return daemon_state.with_daemons_by_server([&no_gil]
+      (const std::map<std::string, DaemonStateCollection> &all) {
+        no_gil.acquire_gil();
+        PyFormatter f;
+        for (const auto &[hostname, daemon_state] : all) {
+          for (const auto &[key, state] : daemon_state) {
+            f.open_array_section(ceph::to_string(key));
+            for (const auto &metric : state->daemon_health_metrics) {
+              f.open_object_section(metric.get_type_name());
+              f.dump_int("value", metric.get_n1());
+              f.dump_string("type", metric.get_type_name());
+              f.close_section();
+            }
+            f.close_section();
+          }
+        }
+        return f.get();
+  });
+}
diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h
index 218497c1e898..6015084e3cdc 100644
--- a/src/mgr/ActivePyModules.h
+++ b/src/mgr/ActivePyModules.h
@@ -222,6 +222,7 @@ public:
   void cluster_log(const std::string &channel, clog_type prio,
                    const std::string &message);
 
+  PyObject* get_daemon_health_metrics();
 
   bool inject_python_on() const;
   void update_cache_metrics();
diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc
index ca441d5e539d..4fb5b250b98b 100644
--- a/src/mgr/BaseMgrModule.cc
+++ b/src/mgr/BaseMgrModule.cc
@@ -1411,6 +1411,12 @@ ceph_unregister_client(BaseMgrModule *self, PyObject *args)
   Py_RETURN_NONE;
 }
 
+static PyObject*
+ceph_get_daemon_health_metrics(BaseMgrModule *self, PyObject *args)
+{
+  return self->py_modules->get_daemon_health_metrics();
+}
+
 PyMethodDef BaseMgrModule_methods[] = {
   {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
    "Get a cluster object"},
@@ -1540,6 +1546,9 @@ PyMethodDef BaseMgrModule_methods[] = {
   {"_ceph_unregister_client", (PyCFunction)ceph_unregister_client,
     METH_VARARGS, "Unregister RADOS instance for potential blocklisting"},
 
+  {"_ceph_get_daemon_health_metrics", (PyCFunction)ceph_get_daemon_health_metrics,
+    METH_VARARGS, "Get health metrics for all daemons"},
+
   {NULL, NULL, 0, NULL}
 };
 
diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h
index ad3ea29efd46..ce0dad2c87e3 100644
--- a/src/mgr/DaemonHealthMetric.h
+++ b/src/mgr/DaemonHealthMetric.h
@@ -44,8 +44,9 @@ public:
     : type(type_), value(n)
   {}
   DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2)
-    : type(type_), value(n1, n2)
+    : type(type_), value(n1, n2)
   {}
+
   daemon_metric get_type() const {
     return type;
   }
@@ -58,6 +59,7 @@ public:
   uint32_t get_n2() const {
     return value.n2;
   }
+
   DENC(DaemonHealthMetric, v, p) {
     DENC_START(1, 1, p);
     denc(v.type, p);
@@ -65,6 +67,10 @@
     DENC_FINISH(p);
   }
 
+  std::string get_type_name() const {
+    return daemon_metric_name(get_type());
+  }
+
   friend std::ostream& operator<<(std::ostream& out, const DaemonHealthMetric& m) {
     return out << daemon_metric_name(m.get_type()) << "(" << m.get_n() << "|("
                << m.get_n1() << "," << m.get_n2() << "))";
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index 1b4744cf6b35..4afae0c20d8c 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -2640,8 +2640,6 @@ void DaemonServer::send_report()
 	       << std::dec << dendl;
 	  continue;
 	}
-	dout(20) << " + " << state->key << " "
-		 << metric << dendl;
 	tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
 						    std::move(collector));
       }
diff --git a/src/pybind/mgr/ceph_module.pyi b/src/pybind/mgr/ceph_module.pyi
index 171919295761..b89402d01be0 100644
--- a/src/pybind/mgr/ceph_module.pyi
+++ b/src/pybind/mgr/ceph_module.pyi
@@ -115,3 +115,4 @@ class BaseMgrModule(object):
     def _ceph_unregister_client(self, addrs: str) -> None: ...
     def _ceph_register_client(self, addrs: str) -> None: ...
     def _ceph_is_authorized(self, arguments: Dict[str, str]) -> bool: ...
+    def _ceph_get_daemon_health_metrics(self) -> Dict[str, List[Dict[str, Any]]]: ...
diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py
index 4afea482eef8..ff15ff69c772 100644
--- a/src/pybind/mgr/mgr_module.py
+++ b/src/pybind/mgr/mgr_module.py
@@ -2273,6 +2273,13 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
         """
         return self._ceph_get_mds_perf_counters(query_id)
 
+    def get_daemon_health_metrics(self) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Get the list of health metrics per daemon. This includes SLOW_OPS health metrics
+        for MON and OSD daemons, and PENDING_CREATING_PGS health metrics for OSDs.
+        """
+        return self._ceph_get_daemon_health_metrics()
+
     def is_authorized(self, arguments: Dict[str, str]) -> bool:
         """
         Verifies that the current session caps permit executing the py service
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 9afb24f7acbf..b06a73241d9a 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -1597,6 +1597,21 @@ class Module(MgrModule):
                 self.metrics[path].set(stats['stat_sum']['num_objects_repaired'],
                                        labelvalues=(stats['poolid'],))
 
+    def get_all_daemon_health_metrics(self) -> None:
+        daemon_metrics = self.get_daemon_health_metrics()
+        self.log.debug('daemon health metrics: %s', daemon_metrics)
+        for daemon_name, health_metrics in daemon_metrics.items():
+            for health_metric in health_metrics:
+                path = f'daemon_health_metrics{daemon_name}{health_metric["type"]}'
+                self.metrics[path] = Metric(
+                    'counter',
+                    'daemon_health_metrics',
+                    'Health metrics for Ceph daemons',
+                    ('type', 'ceph_daemon',)
+                )
+                self.metrics[path].set(health_metric['value'], labelvalues=(
+                    health_metric['type'], daemon_name,))
+
     @profile_method(True)
     def collect(self) -> str:
         # Clear the metrics before scraping
@@ -1615,6 +1630,7 @@ class Module(MgrModule):
         self.get_pg_status()
         self.get_pool_repaired_objects()
         self.get_num_objects()
+        self.get_all_daemon_health_metrics()
 
         for daemon, counters in self.get_all_perf_counters().items():
             for path, counter_info in counters.items():
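
For reference, a minimal sketch of how another manager module might consume the
new MgrModule.get_daemon_health_metrics() call. This is not part of the patch
above: the class name SlowOpsReporter and the logging behaviour are hypothetical,
and the returned shape is assumed from the PyFormatter output built in
ActivePyModules::get_daemon_health_metrics(), i.e. a dict keyed by daemon name
whose values are lists of {"type": ..., "value": ...} entries.

# Hypothetical mgr module snippet; illustrative only, not part of this commit.
from typing import Any, Dict, List

from mgr_module import MgrModule


class SlowOpsReporter(MgrModule):
    def log_slow_ops(self) -> None:
        # Expected shape (assumed from the formatter above):
        # {"osd.1": [{"type": "SLOW_OPS", "value": 3}], "mon.a": [...], ...}
        metrics: Dict[str, List[Dict[str, Any]]] = self.get_daemon_health_metrics()
        for daemon, health_metrics in metrics.items():
            for m in health_metrics:
                if m['type'] == 'SLOW_OPS' and m['value'] > 0:
                    self.log.warning('%s reports %d slow ops', daemon, m['value'])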