From: Shweta Bhosale Date: Tue, 28 Apr 2026 04:52:08 +0000 (+0530) Subject: mgr/cephadm: Skip RDMA device check for NFS during upgrade X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=523c3a22124514b843f2ae99de936e8d5b30e137;p=ceph.git mgr/cephadm: Skip RDMA device check for NFS during upgrade During an image upgrade, prepare_create runs on the asyncio event-loop thread while an outer wait_async is active. Calling wait_async again for cephadm list-rdma on that thread blocks the loop and can hang or time out. Fixes: https://tracker.ceph.com/issues/76284 Signed-off-by: Shweta Bhosale --- diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py index ffc4fff849b1..1bbdb25d75e8 100644 --- a/src/pybind/mgr/cephadm/services/nfs.py +++ b/src/pybind/mgr/cephadm/services/nfs.py @@ -197,24 +197,31 @@ class NFSService(CephService): if spec.enable_rdma: from cephadm.serve import CephadmServe - rdma_devices = self.mgr.wait_async( - CephadmServe(self.mgr).get_rdma_devices(host)) - if not rdma_devices: - raise OrchestratorError( - f'NFS RDMA is enabled but host {host} has no RDMA devices. ' - "Run 'cephadm list-rdma' on the host to verify RDMA is available." + # During a cluster upgrade, prepare_create run on the asyncio + # event-loop thread; a nested wait_async(cephadm list-rdma) there would + # block the loop. Skip the check while upgrade_state is set. 
+ if self.mgr.upgrade.upgrade_state is not None: + self.mgr.log.info('NFS: RDMA list-rdma skipped (cluster upgrade in progress)') + else: + rdma_devices = self.mgr.wait_async( + CephadmServe(self.mgr).get_rdma_devices(host) ) - if bind_addr: - bind_ip = bind_addr.split('/')[0] - iface = self.mgr.cache.get_interface_for_ip(host, bind_ip) - rdma_netdevs = {d.get('netdev', '') for d in rdma_devices} - if iface and iface not in rdma_netdevs: + if not rdma_devices: raise OrchestratorError( - f'NFS RDMA is enabled with bind address {bind_addr} on host {host}, ' - f'but interface {iface} (for this IP) is not RDMA-capable. ' - f'RDMA netdevs on host: {sorted(rdma_netdevs)}. ' - "Use an IP on an RDMA-capable interface or run 'rdma link show' on the host." + f'NFS RDMA is enabled but host {host} has no RDMA devices. ' + "Run 'cephadm list-rdma' on the host to verify RDMA is available." ) + if bind_addr: + bind_ip = bind_addr.split('/')[0] + iface = self.mgr.cache.get_interface_for_ip(host, bind_ip) + rdma_netdevs = {d.get('netdev', '') for d in rdma_devices} + if iface and iface not in rdma_netdevs: + raise OrchestratorError( + f'NFS RDMA is enabled with bind address {bind_addr} on host {host}, ' + f'but interface {iface} (for this IP) is not RDMA-capable. ' + f'RDMA netdevs on host: {sorted(rdma_netdevs)}. ' + "Use an IP on an RDMA-capable interface or run 'rdma link show' on the host." + ) if monitoring_ip: daemon_spec.port_ips.update({str(monitoring_port): monitoring_ip})