from tempfile import NamedTemporaryFile
from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
-from mgr_util import get_default_addr, profile_method, build_url
+from mgr_util import get_default_addr, profile_method, build_url, test_port_allocation, PortAlreadyInUse
from orchestrator import OrchestratorClientMixin, raise_if_exception, OrchestratorError
from rbd import RBD
# it's a dict, the writer doesn't need to declare 'global' for access
_global_instance = None # type: Optional[Module]
+# Lock to serialize CherryPy engine start/stop operations across threads
+# This prevents race conditions when config_notify and serve run concurrently
+_engine_lock = threading.Lock()
+
+# Configuration for port availability waiting
+PORT_WAIT_MAX_TIME = 10.0 # Maximum time to wait for port to become available
+PORT_WAIT_INTERVAL = 0.5 # Polling interval
+
+
+def _wait_for_port_available(
+ log: Any,
+ server_addr: str,
+ server_port: int,
+ max_wait_time: float = PORT_WAIT_MAX_TIME
+) -> bool:
+ """
+ Wait for a port to become available using test_port_allocation.
+
+ Returns:
+ True if port became available, False if timeout reached.
+ """
+ elapsed = 0.0
+ while elapsed < max_wait_time:
+ try:
+ test_port_allocation(server_addr, server_port)
+ return True # Port is available
+ except PortAlreadyInUse:
+ log.debug(f'Port {server_port} still in use, waiting...')
+ time.sleep(PORT_WAIT_INTERVAL)
+ elapsed += PORT_WAIT_INTERVAL
+ except Exception as e:
+ # For other errors (e.g., invalid address), just return and let CherryPy handle it
+ log.debug(f'Port check failed with: {e}')
+ return True
+ return False
+
+
cherrypy.config.update({
'response.headers.server': 'Ceph-Prometheus'
})
# https://stackoverflow.com/questions/7254845/change-cherrypy-port-and-restart-web-server
# if we omit the line: cherrypy.server.httpserver = None
# then the cherrypy server is not restarted correctly
- self.log.info('Restarting engine...')
- cherrypy.engine.stop()
- cherrypy.server.httpserver = None
- server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
- server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
- self.configure(server_addr, server_port)
- cherrypy.engine.start()
- self.log.info('Engine started.')
+ with _engine_lock:
+ self.log.info('Restarting engine...')
+ cherrypy.engine.stop()
+ cherrypy.server.httpserver = None
+ server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
+ server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
+ self.configure(server_addr, server_port)
+ # Wait for port to be available before starting
+ if not _wait_for_port_available(self.log, server_addr, server_port):
+ self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+ cherrypy.engine.start()
+ self.log.info('Engine restarted.')
@profile_method()
def get_health(self) -> None:
self.configure(server_addr, server_port)
cherrypy.tree.mount(Root(), "/")
- self.log.info('Starting engine...')
- try:
- cherrypy.engine.start()
- except Exception as e:
- self.log.error(f'Failed to start engine: {e}')
- return
- self.log.info('Engine started.')
+ with _engine_lock:
+ # Check if engine is already running (could happen if config_notify started it first)
+ if cherrypy.engine.state == cherrypy.engine.states.STARTED:
+ self.log.info('Engine already running, skipping start')
+ else:
+ # Wait for port to be available before starting (handles standby->active transition)
+ if not _wait_for_port_available(self.log, server_addr, server_port):
+ self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+ self.log.info('Starting engine...')
+ try:
+ cherrypy.engine.start()
+ except Exception as e:
+ self.log.error(f'Failed to start engine: {e}')
+ return
+ self.log.info('Engine started.')
# wait for the shutdown event
self.shutdown_event.wait()
self.shutdown_event.clear()
# tell metrics collection thread to stop collecting new metrics
self.metrics_thread.stop()
- cherrypy.engine.stop()
- cherrypy.server.httpserver = None
- self.log.info('Engine stopped.')
+ with _engine_lock:
+ cherrypy.engine.stop()
+ cherrypy.server.httpserver = None
+ self.log.info('Engine stopped.')
self.shutdown_rbd_stats()
# wait for the metrics collection thread to stop
self.metrics_thread.join()
return ''
cherrypy.tree.mount(Root(), '/', {})
- self.log.info('Starting engine...')
- cherrypy.engine.start()
- self.log.info('Engine started.')
+ with _engine_lock:
+ # Check if engine is already running
+ if cherrypy.engine.state == cherrypy.engine.states.STARTED:
+ self.log.info('Engine already running, skipping start')
+ else:
+ # Wait for port to be available before starting
+ if not _wait_for_port_available(self.log, server_addr, server_port):
+ self.log.warning(f'Port {server_port} still in use after waiting, attempting to start anyway')
+ self.log.info('Starting engine...')
+ cherrypy.engine.start()
+ self.log.info('Engine started.')
# Wait for shutdown event
self.shutdown_event.wait()
self.shutdown_event.clear()
- cherrypy.engine.stop()
- cherrypy.server.httpserver = None
- self.log.info('Engine stopped.')
+ with _engine_lock:
+ cherrypy.engine.stop()
+ cherrypy.server.httpserver = None
+ self.log.info('Engine stopped.')
def shutdown(self) -> None:
self.log.info("Stopping engine...")