]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: Skip RDMA device check for NFS during upgrade
author Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Tue, 28 Apr 2026 04:52:08 +0000 (10:22 +0530)
committer Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Tue, 28 Apr 2026 04:58:54 +0000 (10:28 +0530)
During image upgrade, prepare_create runs on the asyncio event-loop
thread while an outer wait_async is active. Calling wait_async again for
cephadm list-rdma on that thread blocks the loop and can hang or time out.

Fixes: https://tracker.ceph.com/issues/76284
Signed-off-by: Shweta Bhosale <Shweta.Bhosale1@ibm.com>
src/pybind/mgr/cephadm/services/nfs.py

index ffc4fff849b12baab7cb0467bcfe5eedc8541169..1bbdb25d75e84a555c733ec09ca2ccd4a7124f91 100644 (file)
@@ -197,24 +197,31 @@ class NFSService(CephService):
 
         if spec.enable_rdma:
             from cephadm.serve import CephadmServe
-            rdma_devices = self.mgr.wait_async(
-                CephadmServe(self.mgr).get_rdma_devices(host))
-            if not rdma_devices:
-                raise OrchestratorError(
-                    f'NFS RDMA is enabled but host {host} has no RDMA devices. '
-                    "Run 'cephadm list-rdma' on the host to verify RDMA is available."
+            # During a cluster upgrade, prepare_create runs on the asyncio
+            # event-loop thread; a nested wait_async(cephadm list-rdma) there would
+            # block the loop. Skip the check while upgrade_state is set.
+            if self.mgr.upgrade.upgrade_state is not None:
+                self.mgr.log.info('NFS: RDMA list-rdma skipped (cluster upgrade in progress)')
+            else:
+                rdma_devices = self.mgr.wait_async(
+                    CephadmServe(self.mgr).get_rdma_devices(host)
                 )
-            if bind_addr:
-                bind_ip = bind_addr.split('/')[0]
-                iface = self.mgr.cache.get_interface_for_ip(host, bind_ip)
-                rdma_netdevs = {d.get('netdev', '') for d in rdma_devices}
-                if iface and iface not in rdma_netdevs:
+                if not rdma_devices:
                     raise OrchestratorError(
-                        f'NFS RDMA is enabled with bind address {bind_addr} on host {host}, '
-                        f'but interface {iface} (for this IP) is not RDMA-capable. '
-                        f'RDMA netdevs on host: {sorted(rdma_netdevs)}. '
-                        "Use an IP on an RDMA-capable interface or run 'rdma link show' on the host."
+                        f'NFS RDMA is enabled but host {host} has no RDMA devices. '
+                        "Run 'cephadm list-rdma' on the host to verify RDMA is available."
                     )
+                if bind_addr:
+                    bind_ip = bind_addr.split('/')[0]
+                    iface = self.mgr.cache.get_interface_for_ip(host, bind_ip)
+                    rdma_netdevs = {d.get('netdev', '') for d in rdma_devices}
+                    if iface and iface not in rdma_netdevs:
+                        raise OrchestratorError(
+                            f'NFS RDMA is enabled with bind address {bind_addr} on host {host}, '
+                            f'but interface {iface} (for this IP) is not RDMA-capable. '
+                            f'RDMA netdevs on host: {sorted(rdma_netdevs)}. '
+                            "Use an IP on an RDMA-capable interface or run 'rdma link show' on the host."
+                        )
 
         if monitoring_ip:
             daemon_spec.port_ips.update({str(monitoring_port): monitoring_ip})