mgr/prometheus: offer ability to disable cache

author Patrick Seidensal <pseidensal@suse.com>

Thu, 26 Aug 2021 13:35:49 +0000 (15:35 +0200)

committer Ernesto Puerta <epuertat@redhat.com>

Mon, 15 Nov 2021 16:56:30 +0000 (17:56 +0100)
author Patrick Seidensal <pseidensal@suse.com>
Thu, 26 Aug 2021 13:35:49 +0000 (15:35 +0200)
committer Ernesto Puerta <epuertat@redhat.com>
Mon, 15 Nov 2021 16:56:30 +0000 (17:56 +0100)
diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst

index 2c0fff5dcf7d87ed964943ae6db96b6b2c7c668e..0f1caff2353ffba1fd4fe6f30b72084eb5b0c925 100644 (file)
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -60,13 +60,12 @@ To set a different scrape interval in the Prometheus module, set
      ceph config set mgr mgr/prometheus/scrape_interval 20
  
  On large clusters (>1000 OSDs), the time to fetch the metrics may become
-significant.  Without the cache, the Prometheus manager module could,
-especially in conjunction with multiple Prometheus instances, overload the
-manager and lead to unresponsive or crashing Ceph manager instances.  Hence,
-the cache is enabled by default and cannot be disabled.  This means that there
-is a possibility that the cache becomes stale.  The cache is considered stale
-when the time to fetch the metrics from Ceph exceeds the configured
-``scrape_interval``.
+significant.  Without the cache, the Prometheus manager module could, especially
+in conjunction with multiple Prometheus instances, overload the manager and lead
+to unresponsive or crashing Ceph manager instances.  Hence, the cache is enabled
+by default.  This means that there is a possibility that the cache becomes
+stale.  The cache is considered stale when the time to fetch the metrics from
+Ceph exceeds the configured :confval:`mgr/prometheus/scrape_interval`.
  
  If that is the case, **a warning will be logged** and the module will either
  
@@ -85,6 +84,10 @@ To tell the module to respond with "service unavailable", set it to ``fail``::
  
      ceph config set mgr mgr/prometheus/stale_cache_strategy fail
  
+If you are confident that you don't require the cache, you can disable it::
+
+    ceph config set mgr mgr/prometheus/cache false
+
  .. _prometheus-rbd-io-statistics:
  
  RBD IO statistics
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py

index e6823e3d996c8fa995719a8665a7c8db0bc90528..896c0f4e3c5ac21f27854d10ce69c72e94513a40 100644 (file)
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -281,6 +281,11 @@ class Module(MgrModule):
              'stale_cache_strategy',
              default='log'
          ),
+        Option(
+            'cache',
+            type='bool',
+            default=True,
+        ),
          Option(
              'rbd_stats_pools',
              default=''
@@ -302,6 +307,7 @@ class Module(MgrModule):
          self.collect_lock = threading.Lock()
          self.collect_time = 0.0
          self.scrape_interval: float = 15.0
+        self.cache = True
          self.stale_cache_strategy: str = self.STALE_CACHE_FAIL
          self.collect_cache: Optional[str] = None
          self.rbd_stats = {
@@ -1317,6 +1323,11 @@ class Module(MgrModule):
  
              @staticmethod
              def _metrics(instance: 'Module') -> Optional[str]:
+                if not self.cache:
+                    self.log.debug('Cache disabled, collecting and returning without cache')
+                    cherrypy.response.headers['Content-Type'] = 'text/plain'
+                    return self.collect()
+
                  # Return cached data if available
                  if not instance.collect_cache:
                      raise cherrypy.HTTPError(503, 'No cached data available yet')
@@ -1372,7 +1383,12 @@ class Module(MgrModule):
              (server_addr, server_port)
          )
  
-        self.metrics_thread.start()
+        self.cache = cast(bool, self.get_localized_module_option('cache', True))
+        if self.cache:
+            self.log.info('Cache enabled')
+            self.metrics_thread.start()
+        else:
+            self.log.info('Cache disabled')
  
          cherrypy.config.update({
              'server.socket_host': server_addr,
author	Patrick Seidensal <pseidensal@suse.com>
	Thu, 26 Aug 2021 13:35:49 +0000 (15:35 +0200)
committer	Ernesto Puerta <epuertat@redhat.com>
	Mon, 15 Nov 2021 16:56:30 +0000 (17:56 +0100)
doc/mgr/prometheus.rst		patch \| blob \| history
src/pybind/mgr/prometheus/module.py		patch \| blob \| history