From: Kushal Deb
Date: Mon, 19 May 2025 12:47:06 +0000 (+0530)
Subject: cephadm: improve hw qat experience with cephadm
X-Git-Tag: testing/wip-vshankar-testing-20250910.085256-debug~4^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=6ca2848006b414b6907befd970c218cc45172283;p=ceph-ci.git

cephadm: improve hw qat experience with cephadm

Signed-off-by: Kushal Deb
---

diff --git a/doc/cephadm/services/rgw.rst b/doc/cephadm/services/rgw.rst
index d9f7d883dd0..d84b5f366ed 100644
--- a/doc/cephadm/services/rgw.rst
+++ b/doc/cephadm/services/rgw.rst
@@ -203,6 +203,22 @@ The ``wildcard_enabled`` flag ensures that a wildcard SAN entry is included in t
 allowing access to buckets in virtual host mode. By default, this flag is disabled.
 example: wildcard SAN - (``*.s3.cephlab.com``)
 
+Cephadm ``ceph orch`` specs for RGW services now support the following optional configuration:
+
+.. code-block:: yaml
+
+    spec:
+      qat:
+        compression: hw | sw
+
+compression:
+
+``hw``: Enables hardware QAT offload (if QAT hardware and VFs are present on the node)
+
+``sw``: Enables QAT software fallback mode
+
+No other keys are currently supported in the ``qat`` block.
+
 Disabling multisite sync traffic
 --------------------------------
 
diff --git a/src/cephadm/cephadmlib/daemons/ceph.py b/src/cephadm/cephadmlib/daemons/ceph.py
index c31a355d7eb..1119dfebb9c 100644
--- a/src/cephadm/cephadmlib/daemons/ceph.py
+++ b/src/cephadm/cephadmlib/daemons/ceph.py
@@ -16,6 +16,7 @@ from ..constants import DEFAULT_IMAGE
 from ..context import CephadmContext
 from ..deployment_utils import to_deployment_container
 from ..exceptions import Error
+from ..call_wrappers import call_throws
 from ..file_utils import (
     make_run_dir,
     pathify,
@@ -192,10 +193,71 @@ class Ceph(ContainerDaemonForm):
             )
             mounts.update(cm)
 
+    def setup_qat_args(self, ctx: CephadmContext, args: List[str]) -> None:
+        """Append the container args used for hardware QAT offload to ``args``.
+
+        Lists ``/dev/vfio/devices`` and passes each entry through as a
+        ``--device``, and (re)writes ``/etc/sysconfig/qat``, which is
+        mounted read-only into the container. If the devices cannot be
+        listed the error is logged and ``args`` is left unchanged.
+        """
+        try:
+            out, _, _ = call_throws(ctx, ['ls', '-1', '/dev/vfio/devices'])
+        except RuntimeError:
+            logger.exception('[QAT] Could not list /dev/vfio/devices')
+            return
+
+        devices = [d for d in out.split('\n') if d]
+        args.extend(
+            [
+                '--cap-add=SYS_ADMIN',
+                '--cap-add=SYS_PTRACE',
+                '--cap-add=IPC_LOCK',
+                '--security-opt',
+                'seccomp=unconfined',
+                '--ulimit',
+                'memlock=209715200:209715200',
+                '--device=/dev/qat_adf_ctl:/dev/qat_adf_ctl',
+                '--device=/dev/vfio/vfio:/dev/vfio/vfio',
+                '-v',
+                '/dev:/dev',
+                '--volume=/etc/sysconfig/qat:/etc/sysconfig/qat:ro',
+            ]
+        )
+        args.extend(
+            f'--device=/dev/vfio/devices/{dev}:/dev/vfio/devices/{dev}'
+            for dev in devices
+        )
+
+        os.makedirs('/etc/sysconfig', exist_ok=True)
+        with open('/etc/sysconfig/qat', 'w') as f:
+            f.write('ServicesEnabled=dc\nPOLICY=8\nQAT_USER=ceph\n')
+
+        logger.info(
+            f'[QAT] Successfully injected container args for {self.identity.daemon_name}'
+        )
+
     def customize_container_args(
         self, ctx: CephadmContext, args: List[str]
     ) -> None:
         args.append(ctx.container_engine.unlimited_pids_option)
+        config_json = fetch_configs(ctx)
+        qat_raw: Any = config_json.get('qat', {})
+        if qat_raw is None:
+            qat_config: Dict[str, Any] = {}
+        elif isinstance(qat_raw, dict):
+            qat_config = qat_raw
+        else:
+            raise Error(
+                f'Invalid qat config: expected dict got {type(qat_raw).__name__}'
+            )
+
+        # QAT container args are only injected for rgw daemons with hw compression
+        if (
+            self.identity.daemon_type == 'rgw'
+            and qat_config.get('compression') == 'hw'
+        ):
+            self.setup_qat_args(ctx, args)
 
     def customize_process_args(
         self, ctx: CephadmContext, args: List[str]
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 9e618d42866..f47a01ff81c 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -1237,6 +1237,15 @@ class RgwService(CephService):
                 'value': 'false' if spec.disable_multisite_sync_traffic else 'true',
             })
 
+        qat_mode = spec.qat.get('compression') if spec.qat else None
+        if qat_mode in ('sw', 'hw'):
+            self.mgr.check_mon_command({
+                'prefix': 'config set',
+                'who': daemon_name,
+                'name': 'qat_compressor_enabled',
+                'value': 'true',
+            })
+
         daemon_spec.keyring = keyring
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
 
@@ -1273,6 +1282,11 @@ class RgwService(CephService):
             'who': utils.name_to_config_section(daemon.name()),
             'name': 'rgw_frontends',
         })
+        self.mgr.check_mon_command({
+            'prefix': 'config rm',
+            'who': utils.name_to_config_section(daemon.name()),
+            'name': 'qat_compressor_enabled',
+        })
         self.mgr.check_mon_command({
             'prefix': 'config-key rm',
             'key': f'rgw/cert/{daemon.name()}',
@@ -1323,6 +1337,9 @@ class RgwService(CephService):
         if hasattr(svc_spec, 'rgw_exit_timeout_secs') and svc_spec.rgw_exit_timeout_secs:
             config['rgw_exit_timeout_secs'] = svc_spec.rgw_exit_timeout_secs
 
+        if svc_spec.qat:
+            config['qat'] = svc_spec.qat
+
         rgw_deps = parent_deps + self.get_dependencies(self.mgr, svc_spec)
         return config, rgw_deps
 
@@ -1454,6 +1471,7 @@ class CephExporterService(CephService):
                 'ceph-exporter.crt': crt,
                 'ceph-exporter.key': key
             }
+
         daemon_spec.keyring = keyring
         daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
         daemon_spec.final_config = merge_dicts(daemon_spec.final_config, exporter_config)
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index b2a089f50c0..85df4c7d9b2 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1238,6 +1238,7 @@ class RGWSpec(ServiceSpec):
                  disable_multisite_sync_traffic: Optional[bool] = None,
                  wildcard_enabled: Optional[bool] = False,
                  rgw_exit_timeout_secs: int = 120,
+                 qat: Optional[Dict[str, str]] = None,
                  ):
         assert service_type == 'rgw', service_type
 
@@ -1300,6 +1301,9 @@ class RGWSpec(ServiceSpec):
         #: How long the RGW will wait to try and complete client requests when told to shut down
         self.rgw_exit_timeout_secs = rgw_exit_timeout_secs
 
+        #: Optional QAT settings; ``compression`` may be ``sw`` or ``hw``
+        self.qat = qat or {}
+
     def get_port_start(self) -> List[int]:
         ports = self.get_port()
         return ports
@@ -1361,6 +1365,14 @@ class RGWSpec(ServiceSpec):
                     'ec profile will be generated automatically based on provided attributes'
                 )
 
+        valid_compression_modes = ('sw', 'hw')
+        if self.qat:
+            compression = self.qat.get('compression')
+            if compression and compression not in valid_compression_modes:
+                raise SpecValidationError(
+                    f"Invalid compression mode {compression}. Only 'sw' and 'hw' are allowed"
+                )
+
 
 yaml.add_representer(RGWSpec, ServiceSpec.yaml_representer)