From d0022cd2ab9d45a9945ca10f6f6599670deed739 Mon Sep 17 00:00:00 2001 From: Nitzan Mordechai Date: Wed, 20 Aug 2025 14:50:40 +0000 Subject: [PATCH] mgr/prometheus: prune stale health checks, compress output This patch introduces several improvements to the Prometheus module: - Introduces `HealthHistory._prune()` to drop stale and inactive health checks. Limits the in-memory healthcheck dict to a configurable max_entries (default 1000). TTL for stale entries is configurable via `healthcheck_history_stale_ttl` (default 3600s). - Refactors HealthHistory.check() to use a unified iteration over known and current checks, improving concurrency and minimizing redundant updates. - Use cherrypy.tools.gzip instead of manual gzip.compress() for cleaner HTTP compression with proper header handling and client negotiation. - Introduces new module options: - `healthcheck_history_max_entries` - Add proper error handling for CherryPy engine startup failures - Remove os._exit monkey patch in favor of proper exception handling - Remove manual Content-Type header setting (CherryPy handles automatically) Fixes: https://tracker.ceph.com/issues/68989 Signed-off-by: Nitzan Mordechai --- doc/mgr/prometheus.rst | 8 ++ src/pybind/mgr/prometheus/module.py | 134 +++++++++++++++++----------- 2 files changed, 89 insertions(+), 53 deletions(-) diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst index 276f00c3146..8f69e465eb6 100644 --- a/doc/mgr/prometheus.rst +++ b/doc/mgr/prometheus.rst @@ -42,6 +42,7 @@ Configuration .. confval:: standby_behaviour .. confval:: standby_error_status_code .. confval:: exclude_perf_counters +.. confval:: healthcheck_history_max_entries By default the module will accept HTTP requests on port ``9283`` on all IPv4 and IPv6 addresses on the host. The port and listen address are @@ -152,6 +153,13 @@ The metrics take the following form: ceph_health_detail{name="OSD_DOWN",severity="HEALTH_WARN"} 1.0 ceph_health_detail{name="PG_DEGRADED",severity="HEALTH_WARN"} 1.0 +The module also maintains an in-memory history of health-check states. +By default the history retains a maximum of 1000 entries. This limit is configurable via the following runtime option: + + ``mgr/prometheus/healthcheck_history_max_entries`` - the maximum number of unique health check entries to track in memory (default: 1000). + +This setting helps avoid unbounded memory growth in large or long-lived clusters. + The health check history may be retrieved and cleared by running the following commands: .. prompt:: bash # diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index a932d3d149b..99d630bcb7a 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -9,6 +9,7 @@ import threading import time import enum from collections import namedtuple +from collections import OrderedDict from tempfile import NamedTemporaryFile from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand @@ -27,14 +28,6 @@ MetricValue = Dict[LabelValues, Number] DEFAULT_PORT = 9283 - -# cherrypy likes to sys.exit on error. don't let it take us down too! -def os_exit_noop(status: int) -> None: - pass - - -os._exit = os_exit_noop # type: ignore - # to access things in class Module from subclass Root. Because # it's a dict, the writer doesn't need to declare 'global' for access @@ -158,6 +151,17 @@ class HealthCheckEvent: """Return the instance as a dictionary.""" return self.__dict__ +class LRUCacheDict(OrderedDict): + def __init__(self, maxsize, *args, **kwargs): + self.maxsize = maxsize + super().__init__(*args, **kwargs) + + def __setitem__(self, key, value): + if key in self: + self.move_to_end(key) + elif len(self) >= self.maxsize: + self.popitem(last=False) # drop oldest + super().__setitem__(key, value) class HealthHistory: kv_name = 'health_history' @@ -167,7 +171,8 @@ class HealthHistory: def __init__(self, mgr: MgrModule): self.mgr = mgr self.lock = threading.Lock() - self.healthcheck: Dict[str, HealthCheckEvent] = {} + self.max_entries = cast(int, self.mgr.get_localized_module_option('healthcheck_history_max_entries', 1000)) + self.healthcheck: LRUCacheDict[str, HealthCheckEvent] = LRUCacheDict(maxsize=self.max_entries) self._load() def _load(self) -> None: @@ -197,7 +202,7 @@ class HealthHistory: """Reset the healthcheck history.""" with self.lock: self.mgr.set_store(self.kv_name, "{}") - self.healthcheck = {} + self.healthcheck.clear() def save(self) -> None: """Save the current in-memory healthcheck history to the KV store.""" @@ -212,43 +217,43 @@ class HealthHistory: """ current_checks = health_checks.get('checks', {}) - changes_made = False - - # first turn off any active states we're tracking - for seen_check in self.healthcheck: - check = self.healthcheck[seen_check] - if check.active and seen_check not in current_checks: - check.active = False - changes_made = True - - # now look for any additions to track now = time.time() - for name, info in current_checks.items(): - if name not in self.healthcheck: - # this healthcheck is new, so start tracking it - changes_made = True - self.healthcheck[name] = HealthCheckEvent( - name=name, - severity=info.get('severity'), - first_seen=now, - last_seen=now, - count=1, - active=True - ) - else: - # seen it before, so update its metadata - check = self.healthcheck[name] - if check.active: - # check has been registered as active already, so skip + with self.lock: + changes_made = False + names = set(self.healthcheck) | set(current_checks) + + for name in names: + present = name in current_checks + check = self.healthcheck.get(name) + if check is None: + if present: + info = current_checks[name] + self.healthcheck[name] = HealthCheckEvent( + name=name, + severity=info.get('severity'), + first_seen=now, + last_seen=now, + count=1, + active=True + ) + changes_made = True + continue - else: + + if present: + if not check.active: + check.count += 1 + changes_made = True + check.last_seen = now - check.count += 1 check.active = True - changes_made = True + else: + if check.active: + check.active = False + changes_made = True - if changes_made: - self.save() + if changes_made: + self.save() def __str__(self) -> str: """Print the healthcheck history. @@ -257,7 +262,6 @@ class HealthHistory: str: Human readable representation of the healthcheck history """ out = [] - if len(self.healthcheck.keys()) == 0: out.append("No healthchecks have been recorded") else: @@ -610,7 +614,14 @@ class Module(MgrModule, OrchestratorClientMixin): desc='Do not include perf-counters in the metrics output', long_desc='Gathering perf-counters from a single Prometheus exporter can degrade ceph-mgr performance, especially in large clusters. Instead, Ceph-exporter daemons are now used by default for perf-counter gathering. This should only be disabled when no ceph-exporters are deployed.', runtime=True - ) + ), + Option( + name='healthcheck_history_max_entries', + type='int', + default=1000, + desc='Maximum number of health check history entries to keep', + runtime=True + ), ] STALE_CACHE_FAIL = 'fail' @@ -1924,8 +1935,10 @@ class Module(MgrModule, OrchestratorClientMixin): self.setup_tls_config(server_addr, server_port) return except Exception as e: - self.log.exception(f'Failed to setup cephadm based secure monitoring stack: {e}\n', - 'Falling back to default configuration') + self.log.exception( + f'Failed to setup cephadm based secure monitoring stack: {e}\n' + 'Falling back to default configuration' + ) # In any error fallback to plain http mode self.setup_default_config(server_addr, server_port) @@ -1938,6 +1951,13 @@ class Module(MgrModule, OrchestratorClientMixin): 'server.ssl_module': None, 'server.ssl_certificate': None, 'server.ssl_private_key': None, + 'tools.gzip.on': True, + 'tools.gzip.mime_types': [ + 'text/plain', + 'text/html', + 'application/json', + ], + 'tools.gzip.compress_level': 6, }) # Publish the URI that others may use to access the service we're about to start serving self.set_uri(build_url(scheme='http', host=self.get_server_addr(), @@ -1979,6 +1999,13 @@ class Module(MgrModule, OrchestratorClientMixin): 'server.ssl_module': 'builtin', 'server.ssl_certificate': cert_file_path, 'server.ssl_private_key': key_file_path, + 'tools.gzip.on': True, + 'tools.gzip.mime_types': [ + 'text/plain', + 'text/html', + 'application/json', + ], + 'tools.gzip.compress_level': 6, }) # Publish the URI that others may use to access the service we're about to start serving self.set_uri(build_url(scheme='https', host=self.get_server_addr(), @@ -2013,10 +2040,9 @@ class Module(MgrModule, OrchestratorClientMixin): @staticmethod def _metrics(instance: 'Module') -> Optional[str]: - if not self.cache: - self.log.debug('Cache disabled, collecting and returning without cache') - cherrypy.response.headers['Content-Type'] = 'text/plain' - return self.collect() + if not instance.cache: + instance.log.debug('Cache disabled, collecting and returning without cache') + return instance.collect() # Return cached data if available if not instance.collect_cache: @@ -2024,7 +2050,6 @@ class Module(MgrModule, OrchestratorClientMixin): def respond() -> Optional[str]: assert isinstance(instance, Module) - cherrypy.response.headers['Content-Type'] = 'text/plain' return instance.collect_cache if instance.collect_time < instance.scrape_interval: @@ -2082,7 +2107,11 @@ class Module(MgrModule, OrchestratorClientMixin): cherrypy.tree.mount(Root(), "/") self.log.info('Starting engine...') - cherrypy.engine.start() + try: + cherrypy.engine.start() + except Exception as e: + self.log.error(f'Failed to start engine: {e}') + return self.log.info('Engine started.') # wait for the shutdown event @@ -2177,7 +2206,6 @@ class StandbyModule(MgrStandbyModule): @cherrypy.expose def metrics(self) -> str: - cherrypy.response.headers['Content-Type'] = 'text/plain' return '' cherrypy.tree.mount(Root(), '/', {}) -- 2.39.5