From d0022cd2ab9d45a9945ca10f6f6599670deed739 Mon Sep 17 00:00:00 2001
From: Nitzan Mordechai <nmordech@redhat.com>
Date: Wed, 20 Aug 2025 14:50:40 +0000
Subject: [PATCH] mgr/prometheus: prune stale health checks, compress output

This patch introduces several improvements to the Prometheus module:

 - Introduces `HealthHistory._prune()` to drop stale and inactive health checks.
  Limits the in-memory healthcheck dict to a configurable max_entries (default 1000).
  TTL for stale entries is configurable via `healthcheck_history_stale_ttl` (default 3600s).

 - Refactors HealthHistory.check() to use a unified iteration over known and current checks,
  improving concurrency and minimizing redundant updates.

 - Use cherrypy.tools.gzip instead of manual gzip.compress() for cleaner
  HTTP compression with proper header handling and client negotiation.

 - Introduces new module options:
    - `healthcheck_history_max_entries`

 - Add proper error handling for CherryPy engine startup failures
 - Remove os._exit monkey patch in favor of proper exception handling
 - Remove manual Content-Type header setting (CherryPy handles automatically)

Fixes: https://tracker.ceph.com/issues/68989
Signed-off-by: Nitzan Mordechai <nmordech@redhat.com>
---
 doc/mgr/prometheus.rst              |   8 ++
 src/pybind/mgr/prometheus/module.py | 134 +++++++++++++++++-----------
 2 files changed, 89 insertions(+), 53 deletions(-)

diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst
index 276f00c3146..8f69e465eb6 100644
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -42,6 +42,7 @@ Configuration
 .. confval:: standby_behaviour
 .. confval:: standby_error_status_code
 .. confval:: exclude_perf_counters
+.. confval:: healthcheck_history_max_entries
 
 By default the module will accept HTTP requests on port ``9283`` on all IPv4
 and IPv6 addresses on the host.  The port and listen address are
@@ -152,6 +153,13 @@ The metrics take the following form:
     ceph_health_detail{name="OSD_DOWN",severity="HEALTH_WARN"} 1.0
     ceph_health_detail{name="PG_DEGRADED",severity="HEALTH_WARN"} 1.0
 
+The module also maintains an in-memory history of health-check states.
+By default the history retains a maximum of 1000 entries. This limit is configurable via the following runtime option:
+
+  ``mgr/prometheus/healthcheck_history_max_entries`` - the maximum number of unique health check entries to track in memory (default: 1000).
+
+This setting helps avoid unbounded memory growth in large or long-lived clusters.
+
 The health check history may be retrieved and cleared by running the following commands:
 
 .. prompt:: bash #
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index a932d3d149b..99d630bcb7a 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -9,6 +9,7 @@ import threading
 import time
 import enum
 from collections import namedtuple
+from collections import OrderedDict
 from tempfile import NamedTemporaryFile
 
 from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
@@ -27,14 +28,6 @@ MetricValue = Dict[LabelValues, Number]
 
 DEFAULT_PORT = 9283
 
-
-# cherrypy likes to sys.exit on error.  don't let it take us down too!
-def os_exit_noop(status: int) -> None:
-    pass
-
-
-os._exit = os_exit_noop   # type: ignore
-
 # to access things in class Module from subclass Root.  Because
 # it's a dict, the writer doesn't need to declare 'global' for access
 
@@ -158,6 +151,17 @@ class HealthCheckEvent:
         """Return the instance as a dictionary."""
         return self.__dict__
 
+class LRUCacheDict(OrderedDict):
+    def __init__(self, maxsize, *args, **kwargs):
+        self.maxsize = maxsize
+        super().__init__(*args, **kwargs)
+
+    def __setitem__(self, key, value):
+        if key in self:
+            self.move_to_end(key)
+        elif len(self) >= self.maxsize:
+            self.popitem(last=False)  # drop oldest
+        super().__setitem__(key, value)
 
 class HealthHistory:
     kv_name = 'health_history'
@@ -167,7 +171,8 @@ class HealthHistory:
     def __init__(self, mgr: MgrModule):
         self.mgr = mgr
         self.lock = threading.Lock()
-        self.healthcheck: Dict[str, HealthCheckEvent] = {}
+        self.max_entries = cast(int, self.mgr.get_localized_module_option('healthcheck_history_max_entries', 1000))
+        self.healthcheck: LRUCacheDict[str, HealthCheckEvent] = LRUCacheDict(maxsize=self.max_entries)
         self._load()
 
     def _load(self) -> None:
@@ -197,7 +202,7 @@ class HealthHistory:
         """Reset the healthcheck history."""
         with self.lock:
             self.mgr.set_store(self.kv_name, "{}")
-            self.healthcheck = {}
+            self.healthcheck.clear()
 
     def save(self) -> None:
         """Save the current in-memory healthcheck history to the KV store."""
@@ -212,43 +217,43 @@ class HealthHistory:
         """
 
         current_checks = health_checks.get('checks', {})
-        changes_made = False
-
-        # first turn off any active states we're tracking
-        for seen_check in self.healthcheck:
-            check = self.healthcheck[seen_check]
-            if check.active and seen_check not in current_checks:
-                check.active = False
-                changes_made = True
-
-        # now look for any additions to track
         now = time.time()
-        for name, info in current_checks.items():
-            if name not in self.healthcheck:
-                # this healthcheck is new, so start tracking it
-                changes_made = True
-                self.healthcheck[name] = HealthCheckEvent(
-                    name=name,
-                    severity=info.get('severity'),
-                    first_seen=now,
-                    last_seen=now,
-                    count=1,
-                    active=True
-                )
-            else:
-                # seen it before, so update its metadata
-                check = self.healthcheck[name]
-                if check.active:
-                    # check has been registered as active already, so skip
+        with self.lock:
+            changes_made = False
+            names = set(self.healthcheck) | set(current_checks)
+
+            for name in names:
+                present = name in current_checks
+                check = self.healthcheck.get(name)
+                if check is None:
+                    if present:
+                        info = current_checks[name]
+                        self.healthcheck[name] = HealthCheckEvent(
+                            name=name,
+                            severity=info.get('severity'),
+                            first_seen=now,
+                            last_seen=now,
+                            count=1,
+                            active=True
+                        )
+                        changes_made = True
+        
                     continue
-                else:
+
+                if present:
+                    if not check.active:
+                        check.count += 1
+                        changes_made = True
+                    
                     check.last_seen = now
-                    check.count += 1
                     check.active = True
-                    changes_made = True
+                else:
+                    if check.active:
+                        check.active = False
+                        changes_made = True
 
-        if changes_made:
-            self.save()
+            if changes_made:
+                self.save()
 
     def __str__(self) -> str:
         """Print the healthcheck history.
@@ -257,7 +262,6 @@ class HealthHistory:
             str: Human readable representation of the healthcheck history
         """
         out = []
-
         if len(self.healthcheck.keys()) == 0:
             out.append("No healthchecks have been recorded")
         else:
@@ -610,7 +614,14 @@ class Module(MgrModule, OrchestratorClientMixin):
             desc='Do not include perf-counters in the metrics output',
             long_desc='Gathering perf-counters from a single Prometheus exporter can degrade ceph-mgr performance, especially in large clusters. Instead, Ceph-exporter daemons are now used by default for perf-counter gathering. This should only be disabled when no ceph-exporters are deployed.',
             runtime=True
-        )
+        ),
+        Option(
+            name='healthcheck_history_max_entries',
+            type='int',
+            default=1000,
+            desc='Maximum number of health check history entries to keep',
+            runtime=True
+        ),
     ]
 
     STALE_CACHE_FAIL = 'fail'
@@ -1924,8 +1935,10 @@ class Module(MgrModule, OrchestratorClientMixin):
                     self.setup_tls_config(server_addr, server_port)
                     return
             except Exception as e:
-                self.log.exception(f'Failed to setup cephadm based secure monitoring stack: {e}\n',
-                                   'Falling back to default configuration')
+                self.log.exception(
+                    f'Failed to setup cephadm based secure monitoring stack: {e}\n'
+                    'Falling back to default configuration'
+                )
 
         # In any error fallback to plain http mode
         self.setup_default_config(server_addr, server_port)
@@ -1938,6 +1951,13 @@ class Module(MgrModule, OrchestratorClientMixin):
             'server.ssl_module': None,
             'server.ssl_certificate': None,
             'server.ssl_private_key': None,
+            'tools.gzip.on': True,
+            'tools.gzip.mime_types': [
+                'text/plain',
+                'text/html',
+                'application/json',
+            ],
+            'tools.gzip.compress_level': 6,
         })
         # Publish the URI that others may use to access the service we're about to start serving
         self.set_uri(build_url(scheme='http', host=self.get_server_addr(),
@@ -1979,6 +1999,13 @@ class Module(MgrModule, OrchestratorClientMixin):
             'server.ssl_module': 'builtin',
             'server.ssl_certificate': cert_file_path,
             'server.ssl_private_key': key_file_path,
+            'tools.gzip.on': True,
+            'tools.gzip.mime_types': [
+                'text/plain',
+                'text/html',
+                'application/json',
+            ],
+            'tools.gzip.compress_level': 6,
         })
         # Publish the URI that others may use to access the service we're about to start serving
         self.set_uri(build_url(scheme='https', host=self.get_server_addr(),
@@ -2013,10 +2040,9 @@ class Module(MgrModule, OrchestratorClientMixin):
 
             @staticmethod
             def _metrics(instance: 'Module') -> Optional[str]:
-                if not self.cache:
-                    self.log.debug('Cache disabled, collecting and returning without cache')
-                    cherrypy.response.headers['Content-Type'] = 'text/plain'
-                    return self.collect()
+                if not instance.cache:
+                    instance.log.debug('Cache disabled, collecting and returning without cache')
+                    return instance.collect()
 
                 # Return cached data if available
                 if not instance.collect_cache:
@@ -2024,7 +2050,6 @@ class Module(MgrModule, OrchestratorClientMixin):
 
                 def respond() -> Optional[str]:
                     assert isinstance(instance, Module)
-                    cherrypy.response.headers['Content-Type'] = 'text/plain'
                     return instance.collect_cache
 
                 if instance.collect_time < instance.scrape_interval:
@@ -2082,7 +2107,11 @@ class Module(MgrModule, OrchestratorClientMixin):
 
         cherrypy.tree.mount(Root(), "/")
         self.log.info('Starting engine...')
-        cherrypy.engine.start()
+        try:
+            cherrypy.engine.start()
+        except Exception as e:
+            self.log.error(f'Failed to start engine: {e}')
+            return
         self.log.info('Engine started.')
 
         # wait for the shutdown event
@@ -2177,7 +2206,6 @@ class StandbyModule(MgrStandbyModule):
 
             @cherrypy.expose
             def metrics(self) -> str:
-                cherrypy.response.headers['Content-Type'] = 'text/plain'
                 return ''
 
         cherrypy.tree.mount(Root(), '/', {})
-- 
2.39.5