Fix the prometheus module crash

author Anmol Babu <Anmol.Babu@ibm.com>

Thu, 11 Dec 2025 08:39:52 +0000 (14:09 +0530)

committer Anmol Babu <Anmol.Babu@ibm.com>

Thu, 11 Dec 2025 09:15:39 +0000 (14:45 +0530)
author Anmol Babu <Anmol.Babu@ibm.com>
Thu, 11 Dec 2025 08:39:52 +0000 (14:09 +0530)
committer Anmol Babu <Anmol.Babu@ibm.com>
Thu, 11 Dec 2025 09:15:39 +0000 (14:45 +0530)
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py

index 28e842a095693c44e5ec2efc1518203c68d5519f..f3dabfe43d301cc09ac56d735e40bc7cabbe5178 100644 (file)
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -12,7 +12,7 @@ from collections import OrderedDict
  from tempfile import NamedTemporaryFile
  
  from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
-from mgr_util import get_default_addr, profile_method, build_url
+from mgr_util import get_default_addr, profile_method, build_url, test_port_allocation, PortAlreadyInUse
  from orchestrator import OrchestratorClientMixin, raise_if_exception, OrchestratorError
  from rbd import RBD
  
@@ -31,6 +31,43 @@ DEFAULT_PORT = 9283
  # it's a dict, the writer doesn't need to declare 'global' for access
  
  _global_instance = None  # type: Optional[Module]
+# Lock to serialize CherryPy engine start/stop operations across threads
+# This prevents race conditions when config_notify and serve run concurrently
+_engine_lock = threading.Lock()
+
+# Configuration for port availability waiting
+PORT_WAIT_MAX_TIME = 10.0  # Maximum time to wait for port to become available
+PORT_WAIT_INTERVAL = 0.5   # Polling interval
+
+
+def _wait_for_port_available(
+    log: Any,
+    server_addr: str,
+    server_port: int,
+    max_wait_time: float = PORT_WAIT_MAX_TIME
+) -> bool:
+    """
+    Wait for a port to become available using test_port_allocation.
+
+    Returns:
+        True if port became available, False if timeout reached.
+    """
+    elapsed = 0.0
+    while elapsed < max_wait_time:
+        try:
+            test_port_allocation(server_addr, server_port)
+            return True  # Port is available
+        except PortAlreadyInUse:
+            log.debug(f'Port {server_port} still in use, waiting...')
+            time.sleep(PORT_WAIT_INTERVAL)
+            elapsed += PORT_WAIT_INTERVAL
+        except Exception as e:
+            # For other errors (e.g., invalid address), just return and let CherryPy handle it
+            log.debug(f'Port check failed with: {e}')
+            return True
+    return False
+
+
  cherrypy.config.update({
      'response.headers.server': 'Ceph-Prometheus'
  })
@@ -941,14 +978,18 @@ class Module(MgrModule, OrchestratorClientMixin):
          # https://stackoverflow.com/questions/7254845/change-cherrypy-port-and-restart-web-server
          # if we omit the line: cherrypy.server.httpserver = None
          # then the cherrypy server is not restarted correctly
-        self.log.info('Restarting engine...')
-        cherrypy.engine.stop()
-        cherrypy.server.httpserver = None
-        server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
-        server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
-        self.configure(server_addr, server_port)
-        cherrypy.engine.start()
-        self.log.info('Engine started.')
+        with _engine_lock:
+            self.log.info('Restarting engine...')
+            cherrypy.engine.stop()
+            cherrypy.server.httpserver = None
+            server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
+            server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
+            self.configure(server_addr, server_port)
+            # Wait for port to be available before starting
+            if not _wait_for_port_available(self.log, server_addr, server_port):
+                self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+            cherrypy.engine.start()
+            self.log.info('Engine restarted.')
  
      @profile_method()
      def get_health(self) -> None:
@@ -2116,22 +2157,31 @@ class Module(MgrModule, OrchestratorClientMixin):
          self.configure(server_addr, server_port)
  
          cherrypy.tree.mount(Root(), "/")
-        self.log.info('Starting engine...')
-        try:
-            cherrypy.engine.start()
-        except Exception as e:
-            self.log.error(f'Failed to start engine: {e}')
-            return
-        self.log.info('Engine started.')
+        with _engine_lock:
+            # Check if engine is already running (could happen if config_notify started it first)
+            if cherrypy.engine.state == cherrypy.engine.states.STARTED:
+                self.log.info('Engine already running, skipping start')
+            else:
+                # Wait for port to be available before starting (handles standby->active transition)
+                if not _wait_for_port_available(self.log, server_addr, server_port):
+                    self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+                self.log.info('Starting engine...')
+                try:
+                    cherrypy.engine.start()
+                except Exception as e:
+                    self.log.error(f'Failed to start engine: {e}')
+                    return
+                self.log.info('Engine started.')
  
          # wait for the shutdown event
          self.shutdown_event.wait()
          self.shutdown_event.clear()
          # tell metrics collection thread to stop collecting new metrics
          self.metrics_thread.stop()
-        cherrypy.engine.stop()
-        cherrypy.server.httpserver = None
-        self.log.info('Engine stopped.')
+        with _engine_lock:
+            cherrypy.engine.stop()
+            cherrypy.server.httpserver = None
+            self.log.info('Engine stopped.')
          self.shutdown_rbd_stats()
          # wait for the metrics collection thread to stop
          self.metrics_thread.join()
@@ -2219,15 +2269,24 @@ class StandbyModule(MgrStandbyModule):
                  return ''
  
          cherrypy.tree.mount(Root(), '/', {})
-        self.log.info('Starting engine...')
-        cherrypy.engine.start()
-        self.log.info('Engine started.')
+        with _engine_lock:
+            # Check if engine is already running
+            if cherrypy.engine.state == cherrypy.engine.states.STARTED:
+                self.log.info('Engine already running, skipping start')
+            else:
+                # Wait for port to be available before starting
+                if not _wait_for_port_available(self.log, server_addr, server_port):
+                    self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+                self.log.info('Starting engine...')
+                cherrypy.engine.start()
+                self.log.info('Engine started.')
          # Wait for shutdown event
          self.shutdown_event.wait()
          self.shutdown_event.clear()
-        cherrypy.engine.stop()
-        cherrypy.server.httpserver = None
-        self.log.info('Engine stopped.')
+        with _engine_lock:
+            cherrypy.engine.stop()
+            cherrypy.server.httpserver = None
+            self.log.info('Engine stopped.')
  
      def shutdown(self) -> None:
          self.log.info("Stopping engine...")
author	Anmol Babu <Anmol.Babu@ibm.com>
	Thu, 11 Dec 2025 08:39:52 +0000 (14:09 +0530)
committer	Anmol Babu <Anmol.Babu@ibm.com>
	Thu, 11 Dec 2025 09:15:39 +0000 (14:45 +0530)