From df7d30ca5b7965f8ea8e617ef682e9a1168c6ec3 Mon Sep 17 00:00:00 2001
From: Patrick Seidensal
Date: Thu, 26 Aug 2021 15:35:49 +0200
Subject: [PATCH] mgr/prometheus: offer ability to disable cache

Fixes: https://tracker.ceph.com/issues/52414
Signed-off-by: Patrick Seidensal
---
 doc/mgr/prometheus.rst              | 18 +++++++++++-------
 src/pybind/mgr/prometheus/module.py | 18 +++++++++++++++++-
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst
index 1c52b248afb..0328a582f14 100644
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -33,6 +33,7 @@ Configuration
 .. confval:: server_addr
 .. confval:: server_port
 .. confval:: scrape_interval
+.. confval:: cache
 .. confval:: stale_cache_strategy
 .. confval:: rbd_stats_pools
 .. confval:: rbd_stats_pools_refresh_interval
@@ -67,13 +68,12 @@ To set a different scrape interval in the Prometheus module, set
     ceph config set mgr mgr/prometheus/scrape_interval 20
 
 On large clusters (>1000 OSDs), the time to fetch the metrics may become
-significant. Without the cache, the Prometheus manager module could,
-especially in conjunction with multiple Prometheus instances, overload the
-manager and lead to unresponsive or crashing Ceph manager instances. Hence,
-the cache is enabled by default and cannot be disabled. This means that there
-is a possibility that the cache becomes stale. The cache is considered stale
-when the time to fetch the metrics from Ceph exceeds the configured
-``scrape_interval``.
+significant. Without the cache, the Prometheus manager module could, especially
+in conjunction with multiple Prometheus instances, overload the manager and lead
+to unresponsive or crashing Ceph manager instances. Hence, the cache is enabled
+by default. This means that there is a possibility that the cache becomes
+stale. The cache is considered stale when the time to fetch the metrics from
+Ceph exceeds the configured :confval:`mgr/prometheus/scrape_interval`.
 
 If that is the case, **a warning will be logged** and the module will either
 
@@ -92,6 +92,10 @@ To tell the module to respond with "service unavailable", set it to ``fail``::
 
     ceph config set mgr mgr/prometheus/stale_cache_strategy fail
 
+If you are confident that you don't require the cache, you can disable it::
+
+    ceph config set mgr mgr/prometheus/cache false
+
 .. _prometheus-rbd-io-statistics:
 
 RBD IO statistics
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 796eddfc8ac..746ad9ceaf3 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -285,6 +285,11 @@ class Module(MgrModule):
             'stale_cache_strategy',
             default='log'
         ),
+        Option(
+            'cache',
+            type='bool',
+            default=True,
+        ),
         Option(
             'rbd_stats_pools',
             default=''
@@ -306,6 +311,7 @@ class Module(MgrModule):
         self.collect_lock = threading.Lock()
         self.collect_time = 0.0
         self.scrape_interval: float = 15.0
+        self.cache = True
         self.stale_cache_strategy: str = self.STALE_CACHE_FAIL
         self.collect_cache: Optional[str] = None
         self.rbd_stats = {
@@ -1321,6 +1327,11 @@ class Module(MgrModule):
 
             @staticmethod
             def _metrics(instance: 'Module') -> Optional[str]:
+                if not instance.cache:
+                    instance.log.debug('Cache disabled, collecting and returning without cache')
+                    cherrypy.response.headers['Content-Type'] = 'text/plain'
+                    return instance.collect()
+
                 # Return cached data if available
                 if not instance.collect_cache:
                     raise cherrypy.HTTPError(503, 'No cached data available yet')
@@ -1376,7 +1387,12 @@ class Module(MgrModule):
             (server_addr, server_port)
         )
 
-        self.metrics_thread.start()
+        self.cache = cast(bool, self.get_localized_module_option('cache', True))
+        if self.cache:
+            self.log.info('Cache enabled')
+            self.metrics_thread.start()
+        else:
+            self.log.info('Cache disabled')
 
         # Publish the URI that others may use to access the service we're
         # about to start serving
-- 
2.39.5
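
Note for reviewers (not part of the patch): a minimal, self-contained sketch of the
dispatch behaviour this change introduces. The names CACHE_ENABLED, SCRAPE_INTERVAL,
collect_metrics, metrics_loop, and handle_scrape are illustrative stand-ins for the
mgr/prometheus/cache option, the scrape interval, Module.collect(), the metrics
thread, and Root._metrics(); only the control flow mirrors the patch.

    # Illustrative sketch only; names below are NOT from module.py.
    import threading
    import time
    from typing import Optional

    CACHE_ENABLED = True        # stand-in for the new mgr/prometheus/cache option
    SCRAPE_INTERVAL = 15.0      # stand-in for mgr/prometheus/scrape_interval

    _cache: Optional[str] = None
    _cache_lock = threading.Lock()

    def collect_metrics() -> str:
        # Stand-in for Module.collect(): gather the metrics text from Ceph.
        return 'example_metric 1.0\n'

    def metrics_loop() -> None:
        # Background refresh, analogous to metrics_thread; only started when
        # the cache is enabled (mirrors the guarded metrics_thread.start()).
        global _cache
        while True:
            data = collect_metrics()
            with _cache_lock:
                _cache = data
            time.sleep(SCRAPE_INTERVAL)

    def handle_scrape() -> str:
        # Analogous to Root._metrics(): with the cache disabled, collect
        # synchronously on every scrape; otherwise serve the cached payload,
        # failing with 503 until the background thread has filled it.
        if not CACHE_ENABLED:
            return collect_metrics()
        with _cache_lock:
            if _cache is None:
                raise RuntimeError('503: no cached data available yet')
            return _cache

With the cache disabled, every Prometheus scrape pays the full collection cost inside
the manager, which is why the documentation above recommends keeping the cache enabled
on large clusters.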