]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
cephadm: improve hw qat experience with cephadm
authorKushal Deb <Kushal.Deb@ibm.com>
Mon, 19 May 2025 12:47:06 +0000 (18:17 +0530)
committerKushal Deb <Kushal.Deb@ibm.com>
Wed, 30 Jul 2025 12:20:32 +0000 (17:50 +0530)
Signed-off-by: Kushal Deb <Kushal.Deb@ibm.com>
doc/cephadm/services/rgw.rst
src/cephadm/cephadmlib/daemons/ceph.py
src/pybind/mgr/cephadm/services/cephadmservice.py
src/python-common/ceph/deployment/service_spec.py

index d9f7d883dd0aaa8307de9aee9cf1bc754090339e..d84b5f366ed09b7a19e50e25aac9348e39052286 100644 (file)
@@ -203,6 +203,22 @@ The ``wildcard_enabled`` flag ensures that a wildcard SAN entry is included in t
 allowing access to buckets in virtual host mode. By default, this flag is disabled.
 example: wildcard SAN - (``*.s3.cephlab.com``)
 
+Cephadm ``ceph orch`` specs for RGW services now support the following optional configuration:
+
+.. code-block:: yaml
+
+  spec:
+    qat:
+      compression: hw  # or "sw"
+
+compression:
+
+``hw``: Enables hardware QAT offload (if QAT hardware and its virtual functions (VFs) are present on the node)
+
+``sw``: Enables QAT software fallback mode
+
+No other keys are currently supported in the ``qat`` block.
+
 Disabling multisite sync traffic
 --------------------------------
 
index c31a355d7eb41373f95b201dd480a4f5899ff321..1119dfebb9c354eaf5c52b8bc92c2160e72b92fa 100644 (file)
@@ -16,6 +16,7 @@ from ..constants import DEFAULT_IMAGE
 from ..context import CephadmContext
 from ..deployment_utils import to_deployment_container
 from ..exceptions import Error
+from ..call_wrappers import call_throws
 from ..file_utils import (
     make_run_dir,
     pathify,
@@ -192,10 +193,64 @@ class Ceph(ContainerDaemonForm):
         )
         mounts.update(cm)
 
+    def setup_qat_args(self, ctx: CephadmContext, args: List[str]) -> None:
+        """Append the container arguments used for hardware QAT offload.
+
+        Lists ``/dev/vfio/devices`` with ``ls -1``, then extends ``args`` in
+        place with the fixed capability, seccomp, ulimit, device and volume
+        options below plus one ``--device`` entry per listed name, and
+        finally (re)writes ``/etc/sysconfig/qat``.
+
+        If the listing command raises ``RuntimeError`` the failure is logged
+        and the method returns without touching ``args`` or the filesystem.
+
+        :param ctx: cephadm context used to run the listing command
+        :param args: container engine argument list, modified in place
+        """
+        # Only the listing call is guarded; everything after it runs solely
+        # when the device list could be obtained.
+        try:
+            out, _, _ = call_throws(ctx, ['ls', '-1', '/dev/vfio/devices'])
+        except RuntimeError:
+            logger.exception('[QAT] Could not list /dev/vfio/devices')
+            return
+
+        # Drop the empty entries produced by the trailing newline.
+        devices = [d for d in out.split('\n') if d]
+
+        args.extend(
+            [
+                '--cap-add=SYS_ADMIN',
+                '--cap-add=SYS_PTRACE',
+                '--cap-add=IPC_LOCK',
+                '--security-opt',
+                'seccomp=unconfined',
+                '--ulimit',
+                'memlock=209715200:209715200',
+                '--device=/dev/qat_adf_ctl:/dev/qat_adf_ctl',
+                '--device=/dev/vfio/vfio:/dev/vfio/vfio',
+                '-v',
+                '/dev:/dev',
+                '--volume=/etc/sysconfig/qat:/etc/sysconfig/qat:ro',
+            ]
+        )
+        args.extend(
+            f'--device=/dev/vfio/devices/{dev}:/dev/vfio/devices/{dev}'
+            for dev in devices
+        )
+
+        # This is the file exposed read-only to the container by the
+        # --volume option above.
+        os.makedirs('/etc/sysconfig', exist_ok=True)
+        with open('/etc/sysconfig/qat', 'w') as f:
+            f.write('ServicesEnabled=dc\nPOLICY=8\nQAT_USER=ceph\n')
+
+        logger.info(
+            f'[QAT] Successfully injected container args for {self.identity.daemon_name}'
+        )
+
+
     def customize_container_args(
         self, ctx: CephadmContext, args: List[str]
     ) -> None:
         args.append(ctx.container_engine.unlimited_pids_option)
+        config_json = fetch_configs(ctx)
+        qat_raw: Any = config_json.get('qat', {})
+        if qat_raw is None:
+            qat_config: Dict[str, Any] = {}
+        elif isinstance(qat_raw, dict):
+            qat_config = qat_raw
+        else:
+            raise Error(
+                f'Invalid qat config: expected dict got {type(qat_raw.__name__)}'
+            )
+
+        if (
+            self.identity.daemon_type == 'rgw'
+            and qat_config.get('compression') == 'hw'
+        ):
+            self.setup_qat_args(ctx, args)
 
     def customize_process_args(
         self, ctx: CephadmContext, args: List[str]
index 9e618d42866d73a520d486b098f85a7d3ba819ca..f47a01ff81c08224de17d03208bcfdbf433948c9 100644 (file)
@@ -1237,6 +1237,15 @@ class RgwService(CephService):
                 'value': 'false' if spec.disable_multisite_sync_traffic else 'true',
             })
 
+        qat_mode = spec.qat.get('compression') if spec.qat else None
+        if qat_mode in ('sw', 'hw'):
+            ret, out, err = self.mgr.check_mon_command({
+                'prefix': 'config set',
+                'who': daemon_name,
+                'name': 'qat_compressor_enabled',
+                'value': 'true',
+            })
+
         daemon_spec.keyring = keyring
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
 
@@ -1273,6 +1282,11 @@ class RgwService(CephService):
             'who': utils.name_to_config_section(daemon.name()),
             'name': 'rgw_frontends',
         })
+        self.mgr.check_mon_command({
+            'prefix': 'config rm',
+            'who': utils.name_to_config_section(daemon.name()),
+            'name': 'qat_compressor_enabled'
+        })
         self.mgr.check_mon_command({
             'prefix': 'config-key rm',
             'key': f'rgw/cert/{daemon.name()}',
@@ -1323,6 +1337,9 @@ class RgwService(CephService):
         if hasattr(svc_spec, 'rgw_exit_timeout_secs') and svc_spec.rgw_exit_timeout_secs:
             config['rgw_exit_timeout_secs'] = svc_spec.rgw_exit_timeout_secs
 
+        if svc_spec.qat:
+            config['qat'] = svc_spec.qat
+
         rgw_deps = parent_deps + self.get_dependencies(self.mgr, svc_spec)
         return config, rgw_deps
 
@@ -1454,6 +1471,7 @@ class CephExporterService(CephService):
                 'ceph-exporter.crt': crt,
                 'ceph-exporter.key': key
             }
+
         daemon_spec.keyring = keyring
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
         daemon_spec.final_config = merge_dicts(daemon_spec.final_config, exporter_config)
index b2a089f50c00ec35dcff2ce80a11164318e7e368..85df4c7d9b282d4d56449d2f96b379bc92755209 100644 (file)
@@ -1238,6 +1238,7 @@ class RGWSpec(ServiceSpec):
                  disable_multisite_sync_traffic: Optional[bool] = None,
                  wildcard_enabled: Optional[bool] = False,
                  rgw_exit_timeout_secs: int = 120,
+                 qat: Optional[Dict[str, str]] = None,
                  ):
         assert service_type == 'rgw', service_type
 
@@ -1300,6 +1301,8 @@ class RGWSpec(ServiceSpec):
         #: How long the RGW will wait to try and complete client requests when told to shut down
         self.rgw_exit_timeout_secs = rgw_exit_timeout_secs
 
+        self.qat = qat or {}
+
     def get_port_start(self) -> List[int]:
         ports = self.get_port()
         return ports
@@ -1361,6 +1364,14 @@ class RGWSpec(ServiceSpec):
                         'ec profile will be generated automatically based on provided attributes'
                     )
 
+        valid_compression_modes = ('sw', 'hw')
+        if self.qat:
+            compression = self.qat.get('compression')
+            if compression and compression not in valid_compression_modes:
+                raise SpecValidationError(
+                    f"Invalid compression mode {compression}. Only 'sw' and 'hw' are allowed"
+                    )
+
 
 yaml.add_representer(RGWSpec, ServiceSpec.yaml_representer)