git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: Add precheck for RDMA devices when deploying NFS with RDMA enabled
author Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Wed, 18 Mar 2026 14:26:54 +0000 (19:56 +0530)
committer Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Tue, 21 Apr 2026 07:44:39 +0000 (13:14 +0530)
If RDMA is enabled, fail the deployment when the host reports no RDMA devices.
If a bind address is also specified, additionally verify that the interface holding that address is RDMA-capable.

Fixes: https://tracker.ceph.com/issues/75189
Signed-off-by: Shweta Bhosale <Shweta.Bhosale1@ibm.com>
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/services/nfs.py
src/pybind/mgr/cephadm/tests/services/test_nfs.py

index 3fda8c0e590701aa69c66870035731e47011ee80..fc43bb40426e783e2289ed4df8e179f0db8a3514 100644 (file)
@@ -950,6 +950,14 @@ class HostCache():
         self.networks[host] = nets
         self.last_network_update[host] = datetime_now()
 
+    def get_interface_for_ip(self, host: str, ip: str) -> Optional[str]:
+        """Return the network interface name that has the given IP on host, or None."""
+        for _subnet, ifaces in self.networks.get(host, {}).items():
+            for iface, ips in ifaces.items():
+                if ip in ips:
+                    return iface
+        return None
+
     def update_daemon_config_deps(self, host: str, name: str, deps: List[str], stamp: datetime.datetime) -> None:
         self.daemon_config_deps[host][name] = {
             'deps': deps,
index 6bfe91a148ae1a7cbe34e0d077d013c93477c072..37811a0abe68f56a25855fc8dd7547526c83373e 100644 (file)
@@ -450,6 +450,17 @@ class CephadmServe:
         self.mgr.cache.save_host(host)
         return None
 
+    async def get_rdma_devices(self, host: str) -> List[Dict[str, Any]]:
+        """Return list of RDMA devices on host from cephadm list-rdma, or [] on error."""
+        try:
+            out = await self._run_cephadm_json(
+                host, 'mon', 'list-rdma', [], no_fsid=True,
+                log_output=self.mgr.log_refresh_metadata)
+            return out if isinstance(out, list) else []
+        except OrchestratorError as e:
+            self.log.error('Failed to get RDMA devices for host %s: %s', host, e)
+            return []
+
     def _refresh_host_osdspec_previews(self, host: str) -> Optional[str]:
         self.update_osdspec_previews(host)
         self.mgr.cache.save_host(host)
index c130c8c7c08e116594053b1dc2a41c06396aba1a..9ad2057ab670adc94883792eec160101396b0a1b 100644 (file)
@@ -194,15 +194,28 @@ class NFSService(CephService):
             logger.warning(f'Bind address in {daemon_type}.{daemon_id}\'s ganesha conf is defaulting to empty')
         else:
             logger.debug("using haproxy bind address: %r", bind_addr)
-            if spec.enable_rdma:
-                logger.warning(
-                    'NFS RDMA is enabled with Bind_Addr %s on host %s. '
-                    'Ensure the network interface for this address is RDMA-capable. '
-                    "On the host, run 'rdma link show' and confirm the netdev for the interface "
-                    'with this IP is listed.',
-                    bind_addr.split('/')[0] if bind_addr else bind_addr,
-                    host,
+
+        if spec.enable_rdma:
+            from cephadm.serve import CephadmServe
+            rdma_devices = self.mgr.wait_async(
+                CephadmServe(self.mgr).get_rdma_devices(host))
+            if not rdma_devices:
+                raise OrchestratorError(
+                    f'NFS RDMA is enabled but host {host} has no RDMA devices. '
+                    "Run 'cephadm list-rdma' on the host to verify RDMA is available."
                 )
+            if bind_addr:
+                bind_ip = bind_addr.split('/')[0]
+                iface = self.mgr.cache.get_interface_for_ip(host, bind_ip)
+                if iface:
+                    rdma_netdevs = {d.get('netdev', '') for d in rdma_devices}
+                    if iface not in rdma_netdevs:
+                        raise OrchestratorError(
+                            f'NFS RDMA is enabled with bind address {bind_addr} on host {host}, '
+                            f'but interface {iface} (for this IP) is not RDMA-capable. '
+                            f'RDMA netdevs on host: {sorted(rdma_netdevs)}. '
+                            "Use an IP on an RDMA-capable interface or run 'rdma link show' on the host."
+                        )
 
         if monitoring_ip:
             daemon_spec.port_ips.update({str(monitoring_port): monitoring_ip})
@@ -290,6 +303,7 @@ class NFSService(CephService):
                 'user': rgw_user,
                 'keyring': rgw_keyring,
             }
+            config['enable_rdma'] = spec.enable_rdma
             logger.debug('Generated cephadm config-json: %s' % config)
             return config
 
index 86cc0ea8a30e47c635e7089e8e258adf59a13ae7..5b31a154405c4273a712282ec0469db98967b3c9 100644 (file)
@@ -486,14 +486,25 @@ class TestNFS:
                 )
                 assert expected_tls_block in ganesha_conf
 
+    @patch("cephadm.serve.CephadmServe._run_cephadm_json")
     @patch("cephadm.serve.CephadmServe._run_cephadm")
     @patch("cephadm.services.nfs.NFSService.fence_old_ranks", MagicMock())
     @patch("cephadm.services.nfs.NFSService.run_grace_tool", MagicMock())
     @patch("cephadm.services.nfs.NFSService.purge", MagicMock())
     @patch("cephadm.services.nfs.NFSService.create_rados_config_obj", MagicMock())
-    def test_nfs_config_rdma_enabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+    def test_nfs_config_rdma_enabled(self, _run_cephadm, _run_cephadm_json, cephadm_module: CephadmOrchestrator):
         """NFS with enable_rdma=True: ganesha.conf has RDMA protocols (nfsrdma, rpcrdma)."""
         _run_cephadm.side_effect = async_side_effect(('{}', '', 0))
+        # Stub _run_cephadm_json for every command: RDMA devices for 'list-rdma'; [] for 'ls'; {} for anything else (so .get works)
+
+        async def mock_list_rdma(host, entity, command, *args, **kwargs):
+            if command == 'list-rdma':
+                return [{'link': 'rdma0/1', 'state': 'ACTIVE',
+                         'physical_state': 'LINK_UP', 'netdev': 'eth0'}]
+            if command == 'ls':
+                return []
+            return {}
+        _run_cephadm_json.side_effect = mock_list_rdma
 
         with with_host(cephadm_module, 'host1', addr='1.2.3.7'):
             nfs_spec = NFSServiceSpec(
@@ -512,14 +523,25 @@ class TestNFS:
                 ganesha_conf = nfs_generated_conf['files']['ganesha.conf']
                 assert "Protocols = 3, 4, nfsrdma, rpcrdma" in ganesha_conf
 
+    @patch("cephadm.serve.CephadmServe._run_cephadm_json")
     @patch("cephadm.serve.CephadmServe._run_cephadm")
     @patch("cephadm.services.nfs.NFSService.fence_old_ranks", MagicMock())
     @patch("cephadm.services.nfs.NFSService.run_grace_tool", MagicMock())
     @patch("cephadm.services.nfs.NFSService.purge", MagicMock())
     @patch("cephadm.services.nfs.NFSService.create_rados_config_obj", MagicMock())
-    def test_nfs_config_rdma_custom_port(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+    def test_nfs_config_rdma_custom_port(self, _run_cephadm, _run_cephadm_json, cephadm_module: CephadmOrchestrator):
         """NFS with enable_rdma and rdma_port: ganesha.conf has NFS_RDMA_Port."""
         _run_cephadm.side_effect = async_side_effect(('{}', '', 0))
+        # Stub _run_cephadm_json for every command: RDMA devices for 'list-rdma'; [] for 'ls'; {} for anything else (so .get works)
+
+        async def mock_list_rdma(host, entity, command, *args, **kwargs):
+            if command == 'list-rdma':
+                return [{'link': 'rdma0/1', 'state': 'ACTIVE',
+                         'physical_state': 'LINK_UP', 'netdev': 'eth0'}]
+            if command == 'ls':
+                return []
+            return {}
+        _run_cephadm_json.side_effect = mock_list_rdma
 
         with with_host(cephadm_module, 'host1', addr='1.2.3.7'):
             nfs_spec = NFSServiceSpec(