]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
Fix the prometheus module crash wip-aashish-prom-module-main
authorAnmol Babu <Anmol.Babu@ibm.com>
Thu, 11 Dec 2025 08:39:52 +0000 (14:09 +0530)
committerAnmol Babu <Anmol.Babu@ibm.com>
Thu, 11 Dec 2025 09:15:39 +0000 (14:45 +0530)
fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2414677
Signed-off-by: Anmol Babu <Anmol.Babu@ibm.com>
src/pybind/mgr/prometheus/module.py

index 28e842a095693c44e5ec2efc1518203c68d5519f..f3dabfe43d301cc09ac56d735e40bc7cabbe5178 100644 (file)
@@ -12,7 +12,7 @@ from collections import OrderedDict
 from tempfile import NamedTemporaryFile
 
 from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
-from mgr_util import get_default_addr, profile_method, build_url
+from mgr_util import get_default_addr, profile_method, build_url, test_port_allocation, PortAlreadyInUse
 from orchestrator import OrchestratorClientMixin, raise_if_exception, OrchestratorError
 from rbd import RBD
 
@@ -31,6 +31,43 @@ DEFAULT_PORT = 9283
 # it's a dict, the writer doesn't need to declare 'global' for access
 
 _global_instance = None  # type: Optional[Module]
+# Lock to serialize CherryPy engine start/stop operations across threads
+# This prevents race conditions when config_notify and serve run concurrently
+_engine_lock = threading.Lock()
+
+# Configuration for port availability waiting
+PORT_WAIT_MAX_TIME = 10.0  # Maximum time to wait for port to become available
+PORT_WAIT_INTERVAL = 0.5   # Polling interval
+
+
+def _wait_for_port_available(
+    log: Any,
+    server_addr: str,
+    server_port: int,
+    max_wait_time: float = PORT_WAIT_MAX_TIME
+) -> bool:
+    """
+    Wait for a port to become available using test_port_allocation.
+
+    Returns:
+        True if port became available, False if timeout reached.
+    """
+    elapsed = 0.0
+    while elapsed < max_wait_time:
+        try:
+            test_port_allocation(server_addr, server_port)
+            return True  # Port is available
+        except PortAlreadyInUse:
+            log.debug(f'Port {server_port} still in use, waiting...')
+            time.sleep(PORT_WAIT_INTERVAL)
+            elapsed += PORT_WAIT_INTERVAL
+        except Exception as e:
+            # For other errors (e.g., invalid address), just return and let CherryPy handle it
+            log.debug(f'Port check failed with: {e}')
+            return True
+    return False
+
+
 cherrypy.config.update({
     'response.headers.server': 'Ceph-Prometheus'
 })
@@ -941,14 +978,18 @@ class Module(MgrModule, OrchestratorClientMixin):
         # https://stackoverflow.com/questions/7254845/change-cherrypy-port-and-restart-web-server
         # if we omit the line: cherrypy.server.httpserver = None
         # then the cherrypy server is not restarted correctly
-        self.log.info('Restarting engine...')
-        cherrypy.engine.stop()
-        cherrypy.server.httpserver = None
-        server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
-        server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
-        self.configure(server_addr, server_port)
-        cherrypy.engine.start()
-        self.log.info('Engine started.')
+        with _engine_lock:
+            self.log.info('Restarting engine...')
+            cherrypy.engine.stop()
+            cherrypy.server.httpserver = None
+            server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
+            server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
+            self.configure(server_addr, server_port)
+            # Wait for port to be available before starting
+            if not _wait_for_port_available(self.log, server_addr, server_port):
+                self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+            cherrypy.engine.start()
+            self.log.info('Engine restarted.')
 
     @profile_method()
     def get_health(self) -> None:
@@ -2116,22 +2157,31 @@ class Module(MgrModule, OrchestratorClientMixin):
         self.configure(server_addr, server_port)
 
         cherrypy.tree.mount(Root(), "/")
-        self.log.info('Starting engine...')
-        try:
-            cherrypy.engine.start()
-        except Exception as e:
-            self.log.error(f'Failed to start engine: {e}')
-            return
-        self.log.info('Engine started.')
+        with _engine_lock:
+            # Check if engine is already running (could happen if config_notify started it first)
+            if cherrypy.engine.state == cherrypy.engine.states.STARTED:
+                self.log.info('Engine already running, skipping start')
+            else:
+                # Wait for port to be available before starting (handles standby->active transition)
+                if not _wait_for_port_available(self.log, server_addr, server_port):
+                    self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+                self.log.info('Starting engine...')
+                try:
+                    cherrypy.engine.start()
+                except Exception as e:
+                    self.log.error(f'Failed to start engine: {e}')
+                    return
+                self.log.info('Engine started.')
 
         # wait for the shutdown event
         self.shutdown_event.wait()
         self.shutdown_event.clear()
         # tell metrics collection thread to stop collecting new metrics
         self.metrics_thread.stop()
-        cherrypy.engine.stop()
-        cherrypy.server.httpserver = None
-        self.log.info('Engine stopped.')
+        with _engine_lock:
+            cherrypy.engine.stop()
+            cherrypy.server.httpserver = None
+            self.log.info('Engine stopped.')
         self.shutdown_rbd_stats()
         # wait for the metrics collection thread to stop
         self.metrics_thread.join()
@@ -2219,15 +2269,24 @@ class StandbyModule(MgrStandbyModule):
                 return ''
 
         cherrypy.tree.mount(Root(), '/', {})
-        self.log.info('Starting engine...')
-        cherrypy.engine.start()
-        self.log.info('Engine started.')
+        with _engine_lock:
+            # Check if engine is already running
+            if cherrypy.engine.state == cherrypy.engine.states.STARTED:
+                self.log.info('Engine already running, skipping start')
+            else:
+                # Wait for port to be available before starting
+                if not _wait_for_port_available(self.log, server_addr, server_port):
+                    self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+                self.log.info('Starting engine...')
+                cherrypy.engine.start()
+                self.log.info('Engine started.')
         # Wait for shutdown event
         self.shutdown_event.wait()
         self.shutdown_event.clear()
-        cherrypy.engine.stop()
-        cherrypy.server.httpserver = None
-        self.log.info('Engine stopped.')
+        with _engine_lock:
+            cherrypy.engine.stop()
+            cherrypy.server.httpserver = None
+            self.log.info('Engine stopped.')
 
     def shutdown(self) -> None:
         self.log.info("Stopping engine...")