git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: NFSSpec changes to apply cluster level QOS during nfs service creation
author Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Wed, 16 Jul 2025 07:23:05 +0000 (12:53 +0530)
committer Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Mon, 27 Apr 2026 12:49:15 +0000 (18:19 +0530)
Fixes: https://tracker.ceph.com/issues/69458
Signed-off-by: Shweta Bhosale <Shweta.Bhosale1@ibm.com>
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/services/cephadmservice.py
src/pybind/mgr/cephadm/services/nfs.py
src/pybind/mgr/nfs/cluster.py
src/pybind/mgr/nfs/module.py
src/python-common/ceph/deployment/service_spec.py

index 37811a0abe68f56a25855fc8dd7547526c83373e..1f1b5c2fd6f8258dfb469bb65d0c9cecfc488045 100644 (file)
@@ -936,6 +936,9 @@ class CephadmServe:
 
         hosts_altered: Set[str] = set()
 
+        if service_type == 'nfs' and self.mgr.spec_store.needs_configuration(spec.service_name()):
+            svc.pre_daemon_service_config(spec)
+
         try:
             # assign names
             for i in range(len(slots_to_add)):
index 5a25837931d258258863da779cea21ec70761c0d..9d6aacb464d710b5582af67e6ff2a6efd54d9611 100644 (file)
@@ -891,6 +891,9 @@ class CephadmService(metaclass=ABCMeta):
     def get_blocking_daemon_hosts(self, service_name: str) -> List[HostSpec]:
         return []
 
+    def pre_daemon_service_config(self, spec: ServiceSpec) -> None:
+        """Hook for service-level setup done ahead of creating a service's daemons.
+
+        The base implementation does nothing; service classes that need such
+        setup override it.
+        """
+
     def has_placement_changed(self, deps: List[str], spec: ServiceSpec) -> bool:
         return False
 
index ffc4fff849b12baab7cb0467bcfe5eedc8541169..554e530590a34c73cedfa7e820f39c68bbba6165 100644 (file)
@@ -171,9 +171,6 @@ class NFSService(CephService):
         )
         self.run_grace_tool(spec, 'add', nodeid)
 
-        # create the rados config object
-        self.create_rados_config_obj(spec)
-
         port = daemon_spec.ports[0] if daemon_spec.ports else 2049
         monitoring_ip, monitoring_port = self.get_monitoring_details(daemon_spec.service_name, host, daemon_spec)
 
@@ -308,9 +305,15 @@ class NFSService(CephService):
 
         return get_cephadm_config(), self.get_dependencies(self.mgr, spec)
 
+    def pre_daemon_service_config(self, spec: ServiceSpec) -> None:
+        """Prepare cluster-wide NFS state ahead of daemon deployment.
+
+        Runs the service-level ``config`` step first and then
+        ``create_rados_config_obj`` (with its default ``clobber=False``).
+        """
+        typed_spec = cast(NFSServiceSpec, spec)
+        self.config(typed_spec)
+        self.create_rados_config_obj(typed_spec)
+
     def create_rados_config_obj(self,
                                 spec: NFSServiceSpec,
                                 clobber: bool = False) -> None:
+        config_file_data = None
         objname = spec.rados_config_name()
         cmd = [
             'rados',
@@ -325,6 +328,7 @@ class NFSService(CephService):
             timeout=10)
         if not result.returncode and not clobber:
             logger.info('Rados config object exists: %s' % objname)
+            config_file_data = result.stdout
         else:
             logger.info('Creating rados config object: %s' % objname)
             result = subprocess.run(
@@ -336,6 +340,20 @@ class NFSService(CephService):
                     f'Unable to create rados config object {objname}: {result.stderr.decode("utf-8")}'
                 )
                 raise RuntimeError(result.stderr.decode("utf-8"))
+        if spec.cluster_qos_config:
+            # set cluster level qos config
+            from nfs.cluster import config_cluster_qos_from_dict
+            assert spec.service_id
+            update_obj = False
+            if config_file_data and 'qosconf-nfs' in config_file_data.decode('utf-8'):
+                update_obj = True
+
+            config_cluster_qos_from_dict(
+                mgr=self.mgr,
+                cluster_id=spec.service_id,
+                qos_dict=spec.cluster_qos_config,
+                update_existing_obj=update_obj
+            )
 
     def create_keyring(self, daemon_spec: CephadmDaemonDeploySpec) -> str:
         daemon_id = daemon_spec.daemon_id
index 8e0d547dac272f8d62518250c19da1e7b52df3ad..0a1cc03e30949929cbb0b06a08b82dd9c46ebe6a 100644 (file)
@@ -27,7 +27,8 @@ from .qos_conf import (
     QOS,
     QOSType,
     QOSBandwidthControl,
-    QOSOpsControl)
+    QOSOpsControl,
+    QOSParams)
 
 if TYPE_CHECKING:
     from nfs.module import Module
@@ -62,6 +63,87 @@ def create_ganesha_pool(mgr: 'MgrModule') -> None:
         log.debug("Successfully created nfs-ganesha pool %s", POOL_NAME)
 
 
+def config_cluster_qos_from_dict(
+    mgr: 'MgrModule',
+    cluster_id: str,
+    qos_dict: Dict[str, Union[str, bool, int]],
+    update_existing_obj: bool = False,
+) -> None:
+    """Build cluster-level QoS settings from a plain dict and store them.
+
+    :param mgr: mgr module, passed through to ``write_cluster_qos_obj``
+    :param cluster_id: id of the NFS cluster the QoS block belongs to
+    :param qos_dict: QoS settings keyed by the ``QOSParams`` values
+    :param update_existing_obj: passed through to ``write_cluster_qos_obj``
+    :raises NFSInvalidOperation: if ``qos_type`` is missing or is not the
+        name of a ``QOSType`` member
+
+    The bandwidth/IOPS check helpers called below may raise as well.
+    QoS is always written as enabled (``enable_qos=True``).
+    """
+    qos_type_name = qos_dict.get(QOSParams.qos_type.value)
+    if not qos_type_name:
+        raise NFSInvalidOperation('qos_type is not specified in qos dict')
+    try:
+        qos_type = QOSType[str(qos_type_name)]
+    except KeyError:
+        # Report an unknown name the same way as a missing one instead of
+        # leaking a bare KeyError to the caller.
+        raise NFSInvalidOperation(
+            f'invalid qos_type {qos_type_name!r} in qos dict'
+        ) from None
+    enable_bw_ctrl = qos_dict.get(QOSParams.enable_bw_ctrl.value)
+    combined_bw_ctrl = qos_dict.get(QOSParams.combined_bw_ctrl.value)
+    enable_iops_ctrl = qos_dict.get(QOSParams.enable_iops_ctrl.value)
+    bw_obj = ops_obj = None
+    if enable_bw_ctrl:
+        # bandwidth values that are not supplied default to "0"
+        bw_obj = QOSBandwidthControl(
+            bool(enable_bw_ctrl),
+            bool(combined_bw_ctrl),
+            export_writebw=str(qos_dict.get(QOSParams.export_writebw.value, "0")),
+            export_readbw=str(qos_dict.get(QOSParams.export_readbw.value, "0")),
+            client_writebw=str(qos_dict.get(QOSParams.client_writebw.value, "0")),
+            client_readbw=str(qos_dict.get(QOSParams.client_readbw.value, "0")),
+            export_rw_bw=str(qos_dict.get(QOSParams.export_rw_bw.value, "0")),
+            client_rw_bw=str(qos_dict.get(QOSParams.client_rw_bw.value, "0")),
+        )
+        bw_obj.qos_bandwidth_checks(qos_type)
+    if enable_iops_ctrl:
+        # IOPS limits that are not supplied default to 0
+        ops_obj = QOSOpsControl(
+            bool(enable_iops_ctrl),
+            max_export_iops=int(qos_dict.get(QOSParams.max_export_iops.value, 0)),
+            max_client_iops=int(qos_dict.get(QOSParams.max_client_iops.value, 0)),
+        )
+        ops_obj.qos_ops_checks(qos_type)
+
+    write_cluster_qos_obj(
+        mgr=mgr,
+        cluster_id=cluster_id,
+        qos_obj=None,
+        enable_qos=True,
+        qos_type=qos_type,
+        bw_obj=bw_obj,
+        ops_obj=ops_obj,
+        update_existing_obj=update_existing_obj
+    )
+
+
+def write_cluster_qos_obj(
+    mgr: 'MgrModule',
+    cluster_id: str,
+    qos_obj: Optional[QOS],
+    enable_qos: bool,
+    qos_type: Optional[QOSType] = None,
+    bw_obj: Optional[QOSBandwidthControl] = None,
+    ops_obj: Optional[QOSOpsControl] = None,
+    update_existing_obj: bool = False
+) -> None:
+    """Persist the cluster-level QoS block for ``cluster_id``.
+
+    With no ``qos_obj`` a new ``QOS`` is built from the given settings;
+    otherwise the passed object is modified in place (``bw_obj``/``ops_obj``
+    only replace the stored ones when they are truthy).  The formatted block
+    is saved via ``write_obj`` only for a new block with
+    ``update_existing_obj`` false, and via ``update_obj`` (with
+    ``should_notify=False``) in every other case.
+    """
+    if qos_obj:
+        log.debug(f"Updating existing QoS block for cluster {cluster_id}")
+        qos_obj.enable_qos = enable_qos
+        qos_obj.qos_type = qos_type
+        if bw_obj:
+            qos_obj.bw_obj = bw_obj
+        if ops_obj:
+            qos_obj.ops_obj = ops_obj
+        use_update = True
+    else:
+        log.debug(f"Creating new QoS block for cluster {cluster_id}")
+        qos_obj = QOS(True, enable_qos, qos_type, bw_obj, ops_obj)
+        use_update = bool(update_existing_obj)
+
+    qos_config = format_block(qos_obj.to_qos_block())
+    rados_obj = NFSRados(mgr.rados, cluster_id)
+    qos_name = qos_conf_obj_name(cluster_id)
+    conf_name = conf_obj_name(cluster_id)
+    if use_update:
+        rados_obj.update_obj(qos_config, qos_name, conf_name, should_notify=False)
+    else:
+        rados_obj.write_obj(qos_config, qos_name, conf_name)
+    log.debug(f"Successfully saved {cluster_id}s QOS bandwidth control config: \n {qos_config}")
+
+
 class NFSCluster:
     def __init__(self, mgr: 'Module') -> None:
         self.mgr = mgr
@@ -73,6 +155,7 @@ class NFSCluster:
             virtual_ip: Optional[str] = None,
             ingress_mode: Optional[IngressType] = None,
             port: Optional[int] = None,
+            cluster_qos_config: Optional[Dict[str, Union[str, bool, int]]] = None,
             ssl: bool = False,
             ssl_cert: Optional[str] = None,
             ssl_key: Optional[str] = None,
@@ -117,6 +200,7 @@ class NFSCluster:
                                   port=ganesha_port,
                                   virtual_ip=virtual_ip_for_ganesha,
                                   enable_haproxy_protocol=enable_haproxy_protocol,
+                                  cluster_qos_config=cluster_qos_config,
                                   ssl=ssl,
                                   ssl_cert=ssl_cert,
                                   ssl_key=ssl_key,
@@ -145,6 +229,7 @@ class NFSCluster:
             spec = NFSServiceSpec(service_type='nfs', service_id=cluster_id,
                                   placement=PlacementSpec.from_string(placement),
                                   port=port,
+                                  cluster_qos_config=cluster_qos_config,
                                   ssl=ssl,
                                   ssl_cert=ssl_cert,
                                   ssl_key=ssl_key,
@@ -178,6 +263,7 @@ class NFSCluster:
             ingress: Optional[bool] = None,
             ingress_mode: Optional[IngressType] = None,
             port: Optional[int] = None,
+            cluster_qos_config: Optional[Dict[str, Union[str, bool, int]]] = None,
             ssl: bool = False,
             ssl_cert: Optional[str] = None,
             ssl_key: Optional[str] = None,
@@ -211,9 +297,24 @@ class NFSCluster:
             self.create_empty_rados_obj(cluster_id)
 
             if cluster_id not in available_clusters(self.mgr):
-                self._call_orch_apply_nfs(cluster_id, placement, virtual_ip, ingress_mode, port,
-                                          ssl, ssl_cert, ssl_key, ssl_ca_cert, tls_ktls, tls_debug,
-                                          tls_min_version, tls_ciphers, enable_rdma, rdma_port)
+                self._call_orch_apply_nfs(
+                    cluster_id,
+                    placement,
+                    virtual_ip,
+                    ingress_mode,
+                    port,
+                    cluster_qos_config=cluster_qos_config,
+                    ssl=ssl,
+                    ssl_cert=ssl_cert,
+                    ssl_key=ssl_key,
+                    ssl_ca_cert=ssl_ca_cert,
+                    tls_ktls=tls_ktls,
+                    tls_debug=tls_debug,
+                    tls_min_version=tls_min_version,
+                    tls_ciphers=tls_ciphers,
+                    enable_rdma=enable_rdma,
+                    rdma_port=rdma_port
+                )
                 return
             raise NonFatalError(f"{cluster_id} cluster already exists")
         except Exception as e:
@@ -385,29 +486,15 @@ class NFSCluster:
                                bw_obj: Optional[QOSBandwidthControl] = None,
                                ops_obj: Optional[QOSOpsControl] = None) -> None:
         """Update cluster QOS config"""
-        qos_obj_exists = False
-        if not qos_obj:
-            log.debug(f"Creating new QoS block for cluster {cluster_id}")
-            qos_obj = QOS(True, enable_qos, qos_type, bw_obj, ops_obj)
-        else:
-            log.debug(f"Updating existing QoS block for cluster {cluster_id}")
-            qos_obj_exists = True
-            qos_obj.enable_qos = enable_qos
-            qos_obj.qos_type = qos_type
-            if bw_obj:
-                qos_obj.bw_obj = bw_obj
-            if ops_obj:
-                qos_obj.ops_obj = ops_obj
-
-        qos_config = format_block(qos_obj.to_qos_block())
-        rados_obj = self._rados(cluster_id)
-        if not qos_obj_exists:
-            rados_obj.write_obj(qos_config, qos_conf_obj_name(cluster_id),
-                                conf_obj_name(cluster_id))
-        else:
-            rados_obj.update_obj(qos_config, qos_conf_obj_name(cluster_id),
-                                 conf_obj_name(cluster_id), should_notify=False)
-        log.debug(f"Successfully saved {cluster_id}s QOS bandwidth control config: \n {qos_config}")
+        write_cluster_qos_obj(
+            mgr=self.mgr,
+            cluster_id=cluster_id,
+            qos_obj=qos_obj,
+            enable_qos=enable_qos,
+            qos_type=qos_type,
+            bw_obj=bw_obj,
+            ops_obj=ops_obj
+        )
 
     def update_cluster_qos(self,
                            cluster_id: str,
index 2a5b43532c744e734b3a17a600cbd2076e16c03f..c3c859e67047406ef51f75cadf1862caf660cf22 100644 (file)
@@ -164,10 +164,12 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
                                 rdma_port: Optional[int] = None,
                                 inbuf: Optional[str] = None) -> None:
         """Create an NFS Cluster"""
+        cluster_qos_config = None
         ssl_cert = ssl_key = ssl_ca_cert = tls_min_version = tls_ciphers = None
         ssl = tls_ktls = tls_debug = False
         if inbuf:
             config = yaml.safe_load(inbuf)
+            cluster_qos_config = config.get('cluster_qos_config')
             ssl = config.get('ssl')
             ssl_cert = config.get('ssl_cert')
             ssl_key = config.get('ssl_key')
@@ -180,6 +182,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
         return self.nfs.create_nfs_cluster(cluster_id=cluster_id, placement=placement,
                                            virtual_ip=virtual_ip, ingress=ingress,
                                            ingress_mode=ingress_mode, port=port,
+                                           cluster_qos_config=cluster_qos_config,
                                            ssl=ssl,
                                            ssl_cert=ssl_cert,
                                            ssl_key=ssl_key,
index fde9e7637a91cdd0649149c08833358033f9b792..48b820320e9a1ed602bd29ba2f8acc27534163a9 100644 (file)
@@ -1375,6 +1375,7 @@ class NFSServiceSpec(ServiceSpec):
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  idmap_conf: Optional[Dict[str, Dict[str, str]]] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
+                 cluster_qos_config: Optional[Dict[str, Union[str, bool, int]]] = None,
                  ssl: bool = False,
                  ssl_cert: Optional[str] = None,
                  ssl_key: Optional[str] = None,
@@ -1412,6 +1413,7 @@ class NFSServiceSpec(ServiceSpec):
         self.enable_nlm = enable_nlm
         self.enable_rdma = enable_rdma
         self.rdma_port = rdma_port
+        self.cluster_qos_config = cluster_qos_config
 
         # colocation_ports is a list of port dicts for ADDITIONAL colocated daemons
         # The first daemon always uses port and monitoring_port from the spec
@@ -1501,6 +1503,37 @@ class NFSServiceSpec(ServiceSpec):
         # Validate colocation_ports
         self.validate_colocation_ports()
 
+        # validate qos dict
+        if self.cluster_qos_config:
+            qos_enable = self.cluster_qos_config.get('enable_qos', True)
+            enable_bw_ctrl = self.cluster_qos_config.get('enable_bw_control', False)
+            combined_bw_ctrl = self.cluster_qos_config.get('combined_rw_bw_control', False)
+            enable_ops_ctrl = self.cluster_qos_config.get('enable_iops_control', False)
+            for key in [qos_enable, enable_bw_ctrl, combined_bw_ctrl, enable_ops_ctrl]:
+                if not isinstance(key, bool):
+                    raise SpecValidationError('Invalid NFS spec: cluster_qos_config is not correct')
+            if not qos_enable or not (enable_bw_ctrl or enable_ops_ctrl):
+                # this means bandwidth or iops qos won't be enabled, so we don't need to set qos
+                self.cluster_qos_config = None
+                return
+
+            # Verify qos_type
+            qos_type = self.cluster_qos_config.get('qos_type')
+            valid_qos_types = ['PerShare', 'PerClient', 'PerShare_PerClient']
+            if not qos_type:
+                raise SpecValidationError('Invalid NFS spec: to set cluster-level QoS, "qos_type" must be provided.')
+            if qos_type not in valid_qos_types:
+                raise SpecValidationError(
+                    f'Invalid NFS spec: "{qos_type}" is not a valid qos_type. Valid types are: {"|".join(valid_qos_types)}.'
+                )
+
+            # Verify bandwidth and IOPS types
+            for key, value in self.cluster_qos_config.items():
+                if key.endswith('bw') and not isinstance(value, str):
+                    raise SpecValidationError(f"Invalid NFS spec: bandwidth '{key}' should be a string")
+                if key.endswith('iops') and not isinstance(value, int):
+                    raise SpecValidationError(f"Invalid NFS spec: IOPS '{key}' should be an integer")
+
         # TLS certificate validation
         if self.ssl and not self.certificate_source:
             raise SpecValidationError('If SSL is enabled, a certificate source must be provided.')