From: Anmol Babu Date: Thu, 11 Dec 2025 08:39:52 +0000 (+0530) Subject: Fix the prometheus module crash X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fheads%2Fwip-aashish-prom-module-main;p=ceph-ci.git Fix the prometheus module crash fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2414677 Signed-off-by: Anmol Babu --- diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 28e842a0956..f3dabfe43d3 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -12,7 +12,7 @@ from collections import OrderedDict from tempfile import NamedTemporaryFile from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand -from mgr_util import get_default_addr, profile_method, build_url +from mgr_util import get_default_addr, profile_method, build_url, test_port_allocation, PortAlreadyInUse from orchestrator import OrchestratorClientMixin, raise_if_exception, OrchestratorError from rbd import RBD @@ -31,6 +31,43 @@ DEFAULT_PORT = 9283 # it's a dict, the writer doesn't need to declare 'global' for access _global_instance = None # type: Optional[Module] +# Lock to serialize CherryPy engine start/stop operations across threads +# This prevents race conditions when config_notify and serve run concurrently +_engine_lock = threading.Lock() + +# Configuration for port availability waiting +PORT_WAIT_MAX_TIME = 10.0 # Maximum time to wait for port to become available +PORT_WAIT_INTERVAL = 0.5 # Polling interval + + +def _wait_for_port_available( + log: Any, + server_addr: str, + server_port: int, + max_wait_time: float = PORT_WAIT_MAX_TIME +) -> bool: + """ + Wait for a port to become available using test_port_allocation. + + Returns: + True if port became available, False if timeout reached. + """ + elapsed = 0.0 + while elapsed < max_wait_time: + try: + test_port_allocation(server_addr, server_port) + return True # Port is available + except PortAlreadyInUse: + log.debug(f'Port {server_port} still in use, waiting...') + time.sleep(PORT_WAIT_INTERVAL) + elapsed += PORT_WAIT_INTERVAL + except Exception as e: + # For other errors (e.g., invalid address), just return and let CherryPy handle it + log.debug(f'Port check failed with: {e}') + return True + return False + + cherrypy.config.update({ 'response.headers.server': 'Ceph-Prometheus' }) @@ -941,14 +978,18 @@ class Module(MgrModule, OrchestratorClientMixin): # https://stackoverflow.com/questions/7254845/change-cherrypy-port-and-restart-web-server # if we omit the line: cherrypy.server.httpserver = None # then the cherrypy server is not restarted correctly - self.log.info('Restarting engine...') - cherrypy.engine.stop() - cherrypy.server.httpserver = None - server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr())) - server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT)) - self.configure(server_addr, server_port) - cherrypy.engine.start() - self.log.info('Engine started.') + with _engine_lock: + self.log.info('Restarting engine...') + cherrypy.engine.stop() + cherrypy.server.httpserver = None + server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr())) + server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT)) + self.configure(server_addr, server_port) + # Wait for port to be available before starting + if not _wait_for_port_available(self.log, server_addr, server_port): + self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway') + cherrypy.engine.start() + self.log.info('Engine restarted.') @profile_method() def get_health(self) -> None: @@ -2116,22 +2157,31 @@ class Module(MgrModule, OrchestratorClientMixin): self.configure(server_addr, server_port) cherrypy.tree.mount(Root(), "/") - self.log.info('Starting engine...') - try: - cherrypy.engine.start() - except Exception as e: - self.log.error(f'Failed to start engine: {e}') - return - self.log.info('Engine started.') + with _engine_lock: + # Check if engine is already running (could happen if config_notify started it first) + if cherrypy.engine.state == cherrypy.engine.states.STARTED: + self.log.info('Engine already running, skipping start') + else: + # Wait for port to be available before starting (handles standby->active transition) + if not _wait_for_port_available(self.log, server_addr, server_port): + self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway') + self.log.info('Starting engine...') + try: + cherrypy.engine.start() + except Exception as e: + self.log.error(f'Failed to start engine: {e}') + return + self.log.info('Engine started.') # wait for the shutdown event self.shutdown_event.wait() self.shutdown_event.clear() # tell metrics collection thread to stop collecting new metrics self.metrics_thread.stop() - cherrypy.engine.stop() - cherrypy.server.httpserver = None - self.log.info('Engine stopped.') + with _engine_lock: + cherrypy.engine.stop() + cherrypy.server.httpserver = None + self.log.info('Engine stopped.') self.shutdown_rbd_stats() # wait for the metrics collection thread to stop self.metrics_thread.join() @@ -2219,15 +2269,24 @@ class StandbyModule(MgrStandbyModule): return '' cherrypy.tree.mount(Root(), '/', {}) - self.log.info('Starting engine...') - cherrypy.engine.start() - self.log.info('Engine started.') + with _engine_lock: + # Check if engine is already running + if cherrypy.engine.state == cherrypy.engine.states.STARTED: + self.log.info('Engine already running, skipping start') + else: + # Wait for port to be available before starting + if not _wait_for_port_available(self.log, server_addr, server_port): + self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway') + self.log.info('Starting engine...') + cherrypy.engine.start() + self.log.info('Engine started.') # Wait for shutdown event self.shutdown_event.wait() self.shutdown_event.clear() - cherrypy.engine.stop() - cherrypy.server.httpserver = None - self.log.info('Engine stopped.') + with _engine_lock: + cherrypy.engine.stop() + cherrypy.server.httpserver = None + self.log.info('Engine stopped.') def shutdown(self) -> None: self.log.info("Stopping engine...")