From df7d30ca5b7965f8ea8e617ef682e9a1168c6ec3 Mon Sep 17 00:00:00 2001
From: Patrick Seidensal
Date: Thu, 26 Aug 2021 15:35:49 +0200
Subject: [PATCH] mgr/prometheus: offer ability to disable cache

Fixes: https://tracker.ceph.com/issues/52414
Signed-off-by: Patrick Seidensal
---
 doc/mgr/prometheus.rst              | 18 +++++++++++-------
 src/pybind/mgr/prometheus/module.py | 18 +++++++++++++++++-
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst
index 1c52b248afb..0328a582f14 100644
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -33,6 +33,7 @@ Configuration
 .. confval:: server_addr
 .. confval:: server_port
 .. confval:: scrape_interval
+.. confval:: cache
 .. confval:: stale_cache_strategy
 .. confval:: rbd_stats_pools
 .. confval:: rbd_stats_pools_refresh_interval
@@ -67,13 +68,12 @@ To set a different scrape interval in the Prometheus module, set
     ceph config set mgr mgr/prometheus/scrape_interval 20
 
 On large clusters (>1000 OSDs), the time to fetch the metrics may become
-significant. Without the cache, the Prometheus manager module could,
-especially in conjunction with multiple Prometheus instances, overload the
-manager and lead to unresponsive or crashing Ceph manager instances. Hence,
-the cache is enabled by default and cannot be disabled. This means that there
-is a possibility that the cache becomes stale. The cache is considered stale
-when the time to fetch the metrics from Ceph exceeds the configured
-``scrape_interval``.
+significant. Without the cache, the Prometheus manager module could, especially
+in conjunction with multiple Prometheus instances, overload the manager and lead
+to unresponsive or crashing Ceph manager instances. Hence, the cache is enabled
+by default. This means that there is a possibility that the cache becomes
+stale. The cache is considered stale when the time to fetch the metrics from
+Ceph exceeds the configured :confval:`mgr/prometheus/scrape_interval`.
 
 If that is the case, **a warning will be logged** and the module will either
 
@@ -92,6 +92,10 @@ To tell the module to respond with "service unavailable", set it to ``fail``::
 
     ceph config set mgr mgr/prometheus/stale_cache_strategy fail
 
+If you are confident that you don't require the cache, you can disable it::
+
+    ceph config set mgr mgr/prometheus/cache false
+
 .. _prometheus-rbd-io-statistics:
 
 RBD IO statistics
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 796eddfc8ac..746ad9ceaf3 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -285,6 +285,11 @@ class Module(MgrModule):
             'stale_cache_strategy',
             default='log'
         ),
+        Option(
+            'cache',
+            type='bool',
+            default=True,
+        ),
         Option(
             'rbd_stats_pools',
             default=''
@@ -306,6 +311,7 @@ class Module(MgrModule):
         self.collect_lock = threading.Lock()
         self.collect_time = 0.0
         self.scrape_interval: float = 15.0
+        self.cache = True
         self.stale_cache_strategy: str = self.STALE_CACHE_FAIL
         self.collect_cache: Optional[str] = None
         self.rbd_stats = {
@@ -1321,6 +1327,11 @@ class Module(MgrModule):
 
             @staticmethod
             def _metrics(instance: 'Module') -> Optional[str]:
+                if not instance.cache:
+                    instance.log.debug('Cache disabled, collecting and returning without cache')
+                    cherrypy.response.headers['Content-Type'] = 'text/plain'
+                    return instance.collect()
+
                 # Return cached data if available
                 if not instance.collect_cache:
                     raise cherrypy.HTTPError(503, 'No cached data available yet')
@@ -1376,7 +1387,12 @@ class Module(MgrModule):
             (server_addr, server_port)
         )
 
-        self.metrics_thread.start()
+        self.cache = cast(bool, self.get_localized_module_option('cache', True))
+        if self.cache:
+            self.log.info('Cache enabled')
+            self.metrics_thread.start()
+        else:
+            self.log.info('Cache disabled')
 
         # Publish the URI that others may use to access the service we're
         # about to start serving
-- 
2.39.5
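
Note for reviewers (not part of the patch): a minimal, self-contained sketch of the
dispatch behaviour this change introduces. The names CACHE_ENABLED, SCRAPE_INTERVAL,
collect_metrics, metrics_loop, and handle_scrape are illustrative stand-ins for the
mgr/prometheus/cache option, the scrape interval, Module.collect(), the metrics
thread, and Root._metrics(); only the control flow mirrors the patch.

    # Illustrative sketch only; names below are NOT from module.py.
    import threading
    import time
    from typing import Optional

    CACHE_ENABLED = True        # stand-in for the new mgr/prometheus/cache option
    SCRAPE_INTERVAL = 15.0      # stand-in for mgr/prometheus/scrape_interval

    _cache: Optional[str] = None
    _cache_lock = threading.Lock()

    def collect_metrics() -> str:
        # Stand-in for Module.collect(): gather the metrics text from Ceph.
        return 'example_metric 1.0\n'

    def metrics_loop() -> None:
        # Background refresh, analogous to metrics_thread; only started when
        # the cache is enabled (mirrors the guarded metrics_thread.start()).
        global _cache
        while True:
            data = collect_metrics()
            with _cache_lock:
                _cache = data
            time.sleep(SCRAPE_INTERVAL)

    def handle_scrape() -> str:
        # Analogous to Root._metrics(): with the cache disabled, collect
        # synchronously on every scrape; otherwise serve the cached payload,
        # failing with 503 until the background thread has filled it.
        if not CACHE_ENABLED:
            return collect_metrics()
        with _cache_lock:
            if _cache is None:
                raise RuntimeError('503: no cached data available yet')
            return _cache

With the cache disabled, every Prometheus scrape pays the full collection cost inside
the manager, which is why the documentation above recommends keeping the cache enabled
on large clusters.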