From: Patrick Seidensal Date: Thu, 26 Aug 2021 13:35:49 +0000 (+0200) Subject: mgr/prometheus: offer ability to disable cache X-Git-Tag: v16.2.7~8^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3999c407cadd4826faacd5d3fe8f165ec614a251;p=ceph.git mgr/prometheus: offer ability to disable cache Fixes: https://tracker.ceph.com/issues/52414 Signed-off-by: Patrick Seidensal (cherry picked from commit df7d30ca5b7965f8ea8e617ef682e9a1168c6ec3) Conflicts: doc/mgr/prometheus.rst - Ignore missing commit f2750a7912a3af22ad35e223006827f2871903c4 --- diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst index 2c0fff5dcf7d8..0f1caff2353ff 100644 --- a/doc/mgr/prometheus.rst +++ b/doc/mgr/prometheus.rst @@ -60,13 +60,12 @@ To set a different scrape interval in the Prometheus module, set ceph config set mgr mgr/prometheus/scrape_interval 20 On large clusters (>1000 OSDs), the time to fetch the metrics may become -significant. Without the cache, the Prometheus manager module could, -especially in conjunction with multiple Prometheus instances, overload the -manager and lead to unresponsive or crashing Ceph manager instances. Hence, -the cache is enabled by default and cannot be disabled. This means that there -is a possibility that the cache becomes stale. The cache is considered stale -when the time to fetch the metrics from Ceph exceeds the configured -``scrape_interval``. +significant. Without the cache, the Prometheus manager module could, especially +in conjunction with multiple Prometheus instances, overload the manager and lead +to unresponsive or crashing Ceph manager instances. Hence, the cache is enabled +by default. This means that there is a possibility that the cache becomes +stale. The cache is considered stale when the time to fetch the metrics from +Ceph exceeds the configured :confval:`mgr/prometheus/scrape_interval`. 
If that is the case, **a warning will be logged** and the module will either @@ -85,6 +84,10 @@ To tell the module to respond with "service unavailable", set it to ``fail``:: ceph config set mgr mgr/prometheus/stale_cache_strategy fail +If you are confident that you don't require the cache, you can disable it:: + + ceph config set mgr mgr/prometheus/cache false + .. _prometheus-rbd-io-statistics: RBD IO statistics diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index e6823e3d996c8..896c0f4e3c5ac 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -281,6 +281,11 @@ class Module(MgrModule): 'stale_cache_strategy', default='log' ), + Option( + 'cache', + type='bool', + default=True, + ), Option( 'rbd_stats_pools', default='' @@ -302,6 +307,7 @@ class Module(MgrModule): self.collect_lock = threading.Lock() self.collect_time = 0.0 self.scrape_interval: float = 15.0 + self.cache = True self.stale_cache_strategy: str = self.STALE_CACHE_FAIL self.collect_cache: Optional[str] = None self.rbd_stats = { @@ -1317,6 +1323,11 @@ class Module(MgrModule): @staticmethod def _metrics(instance: 'Module') -> Optional[str]: + if not self.cache: + self.log.debug('Cache disabled, collecting and returning without cache') + cherrypy.response.headers['Content-Type'] = 'text/plain' + return self.collect() + # Return cached data if available if not instance.collect_cache: raise cherrypy.HTTPError(503, 'No cached data available yet') @@ -1372,7 +1383,12 @@ class Module(MgrModule): (server_addr, server_port) ) - self.metrics_thread.start() + self.cache = cast(bool, self.get_localized_module_option('cache', True)) + if self.cache: + self.log.info('Cache enabled') + self.metrics_thread.start() + else: + self.log.info('Cache disabled') cherrypy.config.update({ 'server.socket_host': server_addr,