mgr/prometheus: introduce fetch_perf_counters_metrics module option (named ``exclude_perf_counters`` in this diff)

author Avan Thakkar <athakkar@redhat.com>

Thu, 1 Dec 2022 06:06:56 +0000 (11:36 +0530)

committer Avan Thakkar <athakkar@redhat.com>

Tue, 4 Apr 2023 17:42:29 +0000 (23:12 +0530)
author Avan Thakkar <athakkar@redhat.com>
Thu, 1 Dec 2022 06:06:56 +0000 (11:36 +0530)
committer Avan Thakkar <athakkar@redhat.com>
Tue, 4 Apr 2023 17:42:29 +0000 (23:12 +0530)
diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst

index cb5053451053b0f6f22f7a66721f88e46e2db1b6..afbbdcf268e150244ab1e62a3e92c277500652b7 100644 (file)
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -39,6 +39,7 @@ Configuration
  .. confval:: rbd_stats_pools_refresh_interval
  .. confval:: standby_behaviour
  .. confval:: standby_error_status_code
+.. confval:: exclude_perf_counters
  
  By default the module will accept HTTP requests on port ``9283`` on all IPv4
  and IPv6 addresses on the host.  The port and listen address are both
@@ -184,6 +185,15 @@ Example to turn up the sync interval to 10 minutes::
  
    ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600
  
+Ceph daemon performance counters metrics
+-----------------------------------------
+
+With the introduction of the ``ceph-exporter`` daemon, the prometheus module no longer exports Ceph daemon
+perf counters as Prometheus metrics by default. However, one may re-enable exporting these metrics by setting
+the module option ``exclude_perf_counters`` to ``false``::
+
+    ceph config set mgr mgr/prometheus/exclude_perf_counters false
+
  Statistic names and labels
  ==========================
  
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py

index 2689e69f1a9e22674660c970426a2dfe4aeeaf1d..ade068b0c368147b98eccc698caf16b88ee1a2f3 100644 (file)
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -598,6 +598,14 @@ class Module(MgrModule):
              min=400,
              max=599,
              runtime=True
+        ),
+        Option(
+            name='exclude_perf_counters',
+            type='bool',
+            default=True,
+            desc='Do not include perf-counters in the metrics output',
+            long_desc='Gathering perf-counters from a single Prometheus exporter can degrade ceph-mgr performance, especially in large clusters. Instead, Ceph-exporter daemons are now used by default for perf-counter gathering. This should only be disabled when no ceph-exporters are deployed.',
+            runtime=True
          )
      ]
  
@@ -1618,26 +1626,10 @@ class Module(MgrModule):
                  self.metrics[path].set(health_metric['value'], labelvalues=(
                      health_metric['type'], daemon_name,))
  
-    @profile_method(True)
-    def collect(self) -> str:
-        # Clear the metrics before scraping
-        for k in self.metrics.keys():
-            self.metrics[k].clear()
-
-        self.get_health()
-        self.get_df()
-        self.get_osd_blocklisted_entries()
-        self.get_pool_stats()
-        self.get_fs()
-        self.get_osd_stats()
-        self.get_quorum_status()
-        self.get_mgr_status()
-        self.get_metadata_and_osd_status()
-        self.get_pg_status()
-        self.get_pool_repaired_objects()
-        self.get_num_objects()
-        self.get_all_daemon_health_metrics()
-
+    def get_perf_counters(self) -> None:
+        """
+        Get the perf counters for all daemons
+        """
          for daemon, counters in self.get_all_perf_counters().items():
              for path, counter_info in counters.items():
                  # Skip histograms, they are represented by long running avgs
@@ -1664,7 +1656,6 @@ class Module(MgrModule):
                              label_names,
                          )
                      self.metrics[_path].set(value, labels)
-
                      _path = path + '_count'
                      if _path not in self.metrics:
                          self.metrics[_path] = Metric(
@@ -1683,8 +1674,30 @@ class Module(MgrModule):
                              label_names,
                          )
                      self.metrics[path].set(value, labels)
-
          self.add_fixed_name_metrics()
+
+    @profile_method(True)
+    def collect(self) -> str:
+        # Clear the metrics before scraping
+        for k in self.metrics.keys():
+            self.metrics[k].clear()
+
+        self.get_health()
+        self.get_df()
+        self.get_osd_blocklisted_entries()
+        self.get_pool_stats()
+        self.get_fs()
+        self.get_osd_stats()
+        self.get_quorum_status()
+        self.get_mgr_status()
+        self.get_metadata_and_osd_status()
+        self.get_pg_status()
+        self.get_pool_repaired_objects()
+        self.get_num_objects()
+        self.get_all_daemon_health_metrics()
+
+        if not self.get_module_option('exclude_perf_counters'):
+            self.get_perf_counters()
          self.get_rbd_stats()
  
          self.get_collect_time_metrics()
author	Avan Thakkar <athakkar@redhat.com>
	Thu, 1 Dec 2022 06:06:56 +0000 (11:36 +0530)
committer	Avan Thakkar <athakkar@redhat.com>
	Tue, 4 Apr 2023 17:42:29 +0000 (23:12 +0530)
doc/mgr/prometheus.rst		patch \| blob \| history
src/pybind/mgr/prometheus/module.py		patch \| blob \| history