From: Adam King Date: Wed, 22 Jan 2025 19:23:48 +0000 (-0500) Subject: mgr/cephadm: allow setting up RGW delaying shutdown to complete client connections X-Git-Tag: v20.3.0~54^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b84bb7212fab490da811180b7e9e4f5700f08e62;p=ceph.git mgr/cephadm: allow setting up RGW delaying shutdown to complete client connections Specifically through the spec file. This was added recently on the RGW side by https://github.com/ceph/ceph/commit/575f5d461706b02a596d5ce959e5fb1aa87c1d8c This commit is to make it easier to take advantage of for users in cephadm deployments Signed-off-by: Adam King --- diff --git a/doc/cephadm/services/rgw.rst b/doc/cephadm/services/rgw.rst index c18d90d53bd..e1d3a8e5441 100644 --- a/doc/cephadm/services/rgw.rst +++ b/doc/cephadm/services/rgw.rst @@ -229,6 +229,41 @@ RGW daemons deployed for that RGW service. For example The daemon can still receive replication data unless it has been removed from the zonegroup and zone replication endpoints. +Draining client connections on shutdown +--------------------------------------- + +When an RGW daemon is stopped for any reason, including during the cephadm upgrade process, +RGW offers a setting to delay shutdown as the RGW daemon attempts to complete ongoing +client requests. This setting is off by default but can be activated manually by either passing +``--stop-timeout=<seconds>`` to the RGW process or by setting the +``rgw_exit_timeout_secs`` config option for the RGW daemon. This value may be configured in +the RGW service spec file by specifying the ``rgw_exit_timeout_secs`` parameter in the spec +file. For example + +.. code-block:: yaml + + service_type: rgw + service_id: foo + placement: + label: rgw + spec: + rgw_realm: myrealm + rgw_zone: myzone + rgw_zonegroup: myzg + rgw_exit_timeout_secs: 120 + +would tell the RGW daemons cephadm deploys for the rgw.foo service to wait up to 120 seconds for current client requests to complete. 
Note that the RGW daemon will refuse +new client requests during this time. + +.. note:: In cephadm deployments this setting defaults to on with a value of 120 seconds. If you would + like to disable this feature you must set ``rgw_exit_timeout_secs`` to 0 in the spec + +.. note:: Modifications to this setting in the spec will not be picked up by the RGW daemons + in the service until they are redeployed using either the ``ceph orch redeploy <service-name>`` + or ``ceph orch daemon redeploy <daemon-name>`` commands + + Service specification --------------------- diff --git a/src/cephadm/cephadmlib/daemons/ceph.py b/src/cephadm/cephadmlib/daemons/ceph.py index 40061672d06..c31a355d7eb 100644 --- a/src/cephadm/cephadmlib/daemons/ceph.py +++ b/src/cephadm/cephadmlib/daemons/ceph.py @@ -90,6 +90,10 @@ class Ceph(ContainerDaemonForm): # but that doesn't seem to persist in the object after it's passed # in further function calls ctr.args = ctr.args + ['--set-crush-location', c_loc] + if self.identity.daemon_type == 'rgw' and config_json is not None: + if 'rgw_exit_timeout_secs' in config_json: + stop_timeout = config_json['rgw_exit_timeout_secs'] + ctr.args = ctr.args + [f'--stop-timeout={stop_timeout}'] return ctr _uid_gid: Optional[Tuple[int, int]] = None diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index d5dcac62cc9..4061e4edcb2 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -461,6 +461,79 @@ class TestCephAdm(object): _cephadm.command_deploy_from(ctx) _deploy_daemon.assert_called() + def test_rgw_exit_timeout(self, funkypatch): + """ + test that rgw exit timeout secs is set properly + """ + funkypatch.patch('cephadm.logger') + funkypatch.patch('cephadm.FileLock') + _deploy_daemon = funkypatch.patch('cephadm.deploy_daemon') + funkypatch.patch('cephadm.make_var_run') + funkypatch.patch('cephadmlib.file_utils.make_run_dir') + funkypatch.patch('os.mkdir') + _migrate_sysctl = funkypatch.patch('cephadm.migrate_sysctl_dir') + 
funkypatch.patch( + 'cephadm.check_unit', + dest=lambda *args, **kwargs: (None, 'running', None), + ) + funkypatch.patch( + 'cephadm.get_unit_name', + dest=lambda *args, **kwargs: 'mon-unit-name', + ) + funkypatch.patch( + 'cephadm.extract_uid_gid', dest=lambda *args, **kwargs: (0, 0) + ) + _get_container = funkypatch.patch('cephadm.get_container') + funkypatch.patch( + 'cephadm.apply_deploy_config_to_ctx', dest=lambda d, c: None + ) + _fetch_configs = funkypatch.patch( + 'cephadmlib.context_getters.fetch_configs' + ) + funkypatch.patch( + 'cephadm.read_configuration_source', dest=lambda c: {} + ) + funkypatch.patch('cephadm.fetch_custom_config_files') + + ctx = _cephadm.CephadmContext() + ctx.name = 'rgw.foo.test.abcdef' + ctx.fsid = 'b66e5288-d8ea-11ef-b953-525400f9646d' + ctx.reconfig = False + ctx.container_engine = mock_docker() + ctx.allow_ptrace = True + ctx.config_json = '-' + ctx.osd_fsid = '0' + ctx.tcp_ports = '3300 6789' + _fetch_configs.return_value = { + 'rgw_exit_timeout_secs': 200 + } + + _get_container.return_value = _cephadm.CephContainer.for_daemon( + ctx, + ident=_cephadm.DaemonIdentity( + fsid='b66e5288-d8ea-11ef-b953-525400f9646d', + daemon_type='rgw', + daemon_id='foo.test.abcdef', + ), + entrypoint='', + args=[], + container_args=[], + volume_mounts={}, + bind_mounts=[], + envs=[], + privileged=False, + ptrace=False, + host_network=True, + ) + + def _exit_timeout_secs_checker(ctx, ident, container, uid, gid, **kwargs): + argval = ' '.join(container.args) + assert '--stop-timeout=200' in argval + + _deploy_daemon.side_effect = _exit_timeout_secs_checker + _cephadm.command_deploy_from(ctx) + _deploy_daemon.assert_called() + @mock.patch('cephadm.logger') @mock.patch('cephadm.fetch_custom_config_files') def test_write_custom_conf_files(self, _get_config, _logger, cephadm_fs): diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 50d6b566a4f..cbc5ccd4c71 100644 --- 
a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -1304,6 +1304,10 @@ class RgwService(CephService): def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: svc_spec = cast(RGWSpec, self.mgr.spec_store[daemon_spec.service_name].spec) config, parent_deps = super().generate_config(daemon_spec) + + if hasattr(svc_spec, 'rgw_exit_timeout_secs') and svc_spec.rgw_exit_timeout_secs: + config['rgw_exit_timeout_secs'] = svc_spec.rgw_exit_timeout_secs + rgw_deps = parent_deps + self.get_dependencies(self.mgr, svc_spec) return config, rgw_deps diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 87fe08633fa..694d332f589 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -273,6 +273,7 @@ class TestCephadm(object): 'service_id': 'r.z', 'service_name': 'rgw.r.z', 'service_type': 'rgw', + 'spec': {'rgw_exit_timeout_secs': 120}, 'status': {'created': mock.ANY, 'running': 1, 'size': 1, 'ports': [80]}, } diff --git a/src/pybind/mgr/cephadm/tests/test_migration.py b/src/pybind/mgr/cephadm/tests/test_migration.py index b12dd30bf4b..939c313c63f 100644 --- a/src/pybind/mgr/cephadm/tests/test_migration.py +++ b/src/pybind/mgr/cephadm/tests/test_migration.py @@ -338,6 +338,7 @@ def test_migrate_rgw_spec(cephadm_module: CephadmOrchestrator, rgw_spec_store_en 'rgw_thread_pool_size=512'], 'rgw_frontend_port': '5000', 'rgw_frontend_type': 'beast', + 'rgw_exit_timeout_secs': 120, }} else: # in a real environment, we still expect the spec to be there, diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py index 668289cf05e..12aa92000e3 100644 --- a/src/pybind/mgr/cephadm/tests/test_spec.py +++ b/src/pybind/mgr/cephadm/tests/test_spec.py @@ -118,6 +118,7 @@ def test_spec_octopus(spec_json): j_c.pop('objectstore', None) 
j_c.pop('filter_logic', None) j_c.pop('anonymous_access', None) + j_c.pop('rgw_exit_timeout_secs', None) return j_c assert spec_json == convert_to_old_style_json(spec.to_json()) diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 1c1b1825f15..39a3b3541e3 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1236,6 +1236,7 @@ class RGWSpec(ServiceSpec): generate_cert: bool = False, disable_multisite_sync_traffic: Optional[bool] = None, wildcard_enabled: Optional[bool] = False, + rgw_exit_timeout_secs: int = 120, ): assert service_type == 'rgw', service_type @@ -1293,6 +1294,8 @@ class RGWSpec(ServiceSpec): self.wildcard_enabled = wildcard_enabled #: Attributes for .rgw.buckets.data pool created in rgw realm bootstrap command self.data_pool_attributes = data_pool_attributes + #: How long the RGW will wait to try and complete client requests when told to shut down + self.rgw_exit_timeout_secs = rgw_exit_timeout_secs def get_port_start(self) -> List[int]: ports = self.get_port() diff --git a/src/python-common/ceph/tests/test_service_spec.py b/src/python-common/ceph/tests/test_service_spec.py index cb5324d0b79..0c5cd313013 100644 --- a/src/python-common/ceph/tests/test_service_spec.py +++ b/src/python-common/ceph/tests/test_service_spec.py @@ -335,6 +335,7 @@ networks: - 10.0.0.0/8 - 192.168.0.0/16 spec: + rgw_exit_timeout_secs: 60 rgw_frontend_type: civetweb rgw_realm: default-rgw-realm rgw_zone: eu-central-1