From 6a6b674f61c647aa161f65e40977ad7f1e5e8f0d Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Thu, 1 Dec 2022 11:36:56 +0530 Subject: [PATCH] mgr/prometheus: introduce fetch_perf_counters_metrics module option Fixes: https://tracker.ceph.com/issues/58164 Signed-off-by: Avan Thakkar Introduce a module option in the prometheus module to enable/disable support for exporting Ceph daemons' perf counters as prometheus metrics; by default, exporting them is disabled. The use case for this option is that, if the ceph-exporter deployment fails for any reason, the user still has the option to fetch these metrics from the prometheus exporter. --- doc/mgr/prometheus.rst | 10 +++++ src/pybind/mgr/prometheus/module.py | 57 ++++++++++++++++++----------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst index 698b6a2d539d7..25a7b0d084ee6 100644 --- a/doc/mgr/prometheus.rst +++ b/doc/mgr/prometheus.rst @@ -41,6 +41,7 @@ Configuration .. confval:: rbd_stats_pools_refresh_interval .. confval:: standby_behaviour .. confval:: standby_error_status_code +.. confval:: exclude_perf_counters By default the module will accept HTTP requests on port ``9283`` on all IPv4 and IPv6 addresses on the host. The port and listen address are both @@ -217,6 +218,15 @@ the module option ``exclude_perf_counters`` to ``false``: ceph config set mgr mgr/prometheus/exclude_perf_counters false +Ceph daemon performance counters metrics +----------------------------------------- + +With the introduction of ``ceph-exporter`` daemon, the prometheus module will no longer export Ceph daemon +perf counters as prometheus metrics by default. 
However, one may re-enable exporting these metrics by setting +the module option ``exclude_perf_counters`` to ``false``:: + + ceph config set mgr mgr/prometheus/exclude_perf_counters false + Statistic names and labels ========================== diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 566b649a0fdc4..f2e97c9d183ad 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -600,6 +600,14 @@ class Module(MgrModule, OrchestratorClientMixin): min=400, max=599, runtime=True + ), + Option( + name='exclude_perf_counters', + type='bool', + default=True, + desc='Do not include perf-counters in the metrics output', + long_desc='Gathering perf-counters from a single Prometheus exporter can degrade ceph-mgr performance, especially in large clusters. Instead, Ceph-exporter daemons are now used by default for perf-counter gathering. This should only be disabled when no ceph-exporters are deployed.', + runtime=True ) ] @@ -1640,26 +1648,10 @@ class Module(MgrModule, OrchestratorClientMixin): self.metrics[path].set(health_metric['value'], labelvalues=( health_metric['type'], daemon_name,)) - @profile_method(True) - def collect(self) -> str: - # Clear the metrics before scraping - for k in self.metrics.keys(): - self.metrics[k].clear() - - self.get_health() - self.get_df() - self.get_osd_blocklisted_entries() - self.get_pool_stats() - self.get_fs() - self.get_osd_stats() - self.get_quorum_status() - self.get_mgr_status() - self.get_metadata_and_osd_status() - self.get_pg_status() - self.get_pool_repaired_objects() - self.get_num_objects() - self.get_all_daemon_health_metrics() - + def get_perf_counters(self) -> None: + """ + Get the perf counters for all daemons + """ for daemon, counters in self.get_all_perf_counters().items(): for path, counter_info in counters.items(): # Skip histograms, they are represented by long running avgs @@ -1686,7 +1678,6 @@ class Module(MgrModule, 
OrchestratorClientMixin): label_names, ) self.metrics[_path].set(value, labels) - _path = path + '_count' if _path not in self.metrics: self.metrics[_path] = Metric( @@ -1705,8 +1696,30 @@ class Module(MgrModule, OrchestratorClientMixin): label_names, ) self.metrics[path].set(value, labels) - self.add_fixed_name_metrics() + + @profile_method(True) + def collect(self) -> str: + # Clear the metrics before scraping + for k in self.metrics.keys(): + self.metrics[k].clear() + + self.get_health() + self.get_df() + self.get_osd_blocklisted_entries() + self.get_pool_stats() + self.get_fs() + self.get_osd_stats() + self.get_quorum_status() + self.get_mgr_status() + self.get_metadata_and_osd_status() + self.get_pg_status() + self.get_pool_repaired_objects() + self.get_num_objects() + self.get_all_daemon_health_metrics() + + if not self.get_module_option('exclude_perf_counters'): + self.get_perf_counters() self.get_rbd_stats() self.get_collect_time_metrics() -- 2.39.5