]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: configurable per-service stop timeout 65468/head
authorKushal Deb <Kushal.Deb@ibm.com>
Tue, 9 Sep 2025 12:02:00 +0000 (17:32 +0530)
committerKushal Deb <Kushal.Deb@ibm.com>
Thu, 9 Oct 2025 05:28:06 +0000 (10:58 +0530)
Introduce a termination_grace_period field in service spec to define how long the
orchestrator should wait for a service to shut down gracefully before forcefully terminating it.
The value is plumbed mgr -> cephadm and written into 'unit.stop' as 'podman stop -t <N>'

Signed-off-by: Kushal Deb <Kushal.Deb@ibm.com>
src/cephadm/cephadm.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_spec.py
src/pybind/mgr/orchestrator/tests/test_orchestrator.py
src/python-common/ceph/deployment/drive_group.py
src/python-common/ceph/deployment/service_spec.py
src/python-common/ceph/tests/test_service_spec.py

index 26f13ecda34d1085f870597967d3ed7dbf465758..fe7dcd688aac7d6e57cde75a52d5569b7edf5c62 100755 (executable)
@@ -966,6 +966,7 @@ def deploy_daemon(
                     endpoints=endpoints,
                     init_containers=init_containers,
                     sidecars=sidecars,
+                    stop_timeout=getattr(ctx, 'termination_grace_period_seconds', None),
                 )
             else:
                 raise RuntimeError('attempting to deploy a daemon without a container image')
@@ -1032,6 +1033,7 @@ def deploy_daemon_units(
     endpoints: Optional[List[EndPoint]] = None,
     init_containers: Optional[List[InitContainer]] = None,
     sidecars: Optional[List[SidecarContainer]] = None,
+    stop_timeout: Optional[int] = None,
 ) -> None:
     data_dir = ident.data_dir(ctx.data_dir)
     pre_start_commands: List[runscripts.Command] = []
@@ -1065,7 +1067,7 @@ def deploy_daemon_units(
         endpoints=endpoints,
         pre_start_commands=pre_start_commands,
         post_stop_commands=post_stop_commands,
-        timeout=30 if ident.daemon_type == 'osd' else None,
+        timeout=stop_timeout,
     )
 
     # sysctl
@@ -2973,6 +2975,7 @@ def apply_deploy_config_to_ctx(
         if key not in facade.defaults:
             logger.warning('unexpected parameter: %r=%r', key, value)
         setattr(ctx, key, value)
+
     update_default_image(ctx)
     logger.debug('Determined image: %r', ctx.image)
 
@@ -4538,6 +4541,12 @@ def _add_deploy_parser_args(
         default=[],
         help='Additional entrypoint arguments to apply to deamon'
     )
+    parser_deploy.add_argument(
+        '--termination-grace-period-seconds',
+        type=int,
+        default=None,
+        help='Time in seconds to wait for graceful service shutdown before forcefully killing it'
+    )
 
 
 def _name_opts(parser: argparse.ArgumentParser) -> None:
index 26d6762df5ed971ec29e458065767ee2b12b9693..ba54d128c1e739c8591d8892bf18ae7f1b516fb6 100644 (file)
@@ -1182,6 +1182,7 @@ class CephadmServe:
                     dd.daemon_type in CEPH_TYPES:
                 self.log.info('Reconfiguring %s (extra config changed)...' % dd.name())
                 action = 'reconfig'
+
             if action:
                 if self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \
                         and action == 'reconfig':
@@ -1427,6 +1428,14 @@ class CephadmServe:
                     'Reconfiguring' if reconfig else 'Deploying',
                     daemon_spec.name(), daemon_spec.host))
 
+                termination_grace_period = None
+                if daemon_spec.service_name in self.mgr.spec_store:
+                    svc_spec = self.mgr.spec_store[daemon_spec.service_name].spec
+                    termination_grace_period = getattr(svc_spec, 'termination_grace_period_seconds', None)
+
+                if termination_grace_period is not None:
+                    daemon_params['termination_grace_period_seconds'] = int(termination_grace_period)
+
                 out, err, code = await self._run_cephadm(
                     daemon_spec.host,
                     daemon_spec.name(),
index 2e832807895b09f00f00942b82ee1d608fbae28a..c90b1e7ae7930980d2bb8d0a1f72fafd948b3891 100644 (file)
@@ -130,7 +130,10 @@ def test_spec_octopus(spec_json):
         j_c.pop('rgw_exit_timeout_secs', None)
         return j_c
 
-    assert spec_json == convert_to_old_style_json(spec.to_json())
+    converted = convert_to_old_style_json(spec.to_json())
+    if spec_json.get('service_type') == 'osd':
+        spec_json['termination_grace_period_seconds'] = 30
+    assert spec_json == converted
 
 
 @pytest.mark.parametrize(
index df662f4ad9e175f9f0739f853b27262e2e85e65c..5448239215e0a58f6fa89377d1a7fa2f9c669157 100644 (file)
@@ -172,6 +172,7 @@ def test_orch_ls(_describe_service):
         spec:
           filter_logic: AND
           objectstore: bluestore
+          termination_grace_period_seconds: 30
         status:
           running: 123
           size: 0
index 43175aa79fbc62e7eae821520def4da5098c66e9..a4fae2c93bd84119c73939fa30784402c3ba3bdc 100644 (file)
@@ -171,7 +171,8 @@ class DriveGroupSpec(ServiceSpec):
         "data_devices", "db_devices", "wal_devices", "journal_devices",
         "data_directories", "osds_per_device", "objectstore", "osd_id_claims",
         "journal_size", "unmanaged", "filter_logic", "preview_only", "extra_container_args",
-        "extra_entrypoint_args", "data_allocate_fraction", "method", "crush_device_class", "config",
+        "extra_entrypoint_args", "data_allocate_fraction", "method",
+        "termination_grace_period_seconds", "crush_device_class", "config",
     ]
 
     def __init__(self,
@@ -203,6 +204,7 @@ class DriveGroupSpec(ServiceSpec):
                  config=None,  # type: Optional[Dict[str, str]]
                  custom_configs=None,  # type: Optional[List[CustomConfig]]
                  crush_device_class=None,  # type: Optional[str]
+                 termination_grace_period_seconds: Optional[int] = 30,
                  ):
         assert service_type is None or service_type == 'osd'
         super(DriveGroupSpec, self).__init__('osd', service_id=service_id,
@@ -212,7 +214,9 @@ class DriveGroupSpec(ServiceSpec):
                                              preview_only=preview_only,
                                              extra_container_args=extra_container_args,
                                              extra_entrypoint_args=extra_entrypoint_args,
-                                             custom_configs=custom_configs)
+                                             custom_configs=custom_configs,
+                                             termination_grace_period_seconds=(
+                                                 termination_grace_period_seconds))
 
         #: A :class:`ceph.deployment.drive_group.DeviceSelection`
         self.data_devices = data_devices
index ff10b5fd2dee725e61bc41194b1f8a0c173a3b5a..9000a287392488be1d2250e0b3c54f4094657895 100644 (file)
@@ -953,6 +953,7 @@ class ServiceSpec(object):
                  custom_configs: Optional[List[CustomConfig]] = None,
                  ip_addrs: Optional[Dict[str, str]] = None,
                  ssl_ca_cert: Optional[str] = None,
+                 termination_grace_period_seconds: Optional[int] = None,
                  ):
 
         #: See :ref:`orchestrator-cli-placement-spec`.
@@ -1014,6 +1015,15 @@ class ServiceSpec(object):
         # is the IP address {hostname: ip} that the NFS service should bind to on that host.
         self.ip_addrs = ip_addrs
 
+        self.termination_grace_period_seconds = termination_grace_period_seconds
+        if (
+            self.termination_grace_period_seconds is not None
+            and self.termination_grace_period_seconds < 0
+        ):
+            raise SpecValidationError(
+                'termination_grace_period_seconds must be >= 0'
+            )
+
     def __setattr__(self, name: str, value: Any) -> None:
         if value is not None and name in ('extra_container_args', 'extra_entrypoint_args'):
             for v in value:
@@ -1172,6 +1182,9 @@ class ServiceSpec(object):
             if val:
                 c[key] = val
 
+        if getattr(self, 'termination_grace_period_seconds', None) is not None:
+            c['termination_grace_period_seconds'] = self.termination_grace_period_seconds
+
         if self.service_type in self.REQUIRES_CERTIFICATES:
 
             tls: TLSBlock = {}
index a855406fc73741bef0cd8ace269fa9957783b5ba..ab3c1f988843b8130d4d0e8504d9e002ba60a1d3 100644 (file)
@@ -355,6 +355,7 @@ spec:
   objectstore: bluestore
   wal_devices:
     model: NVME-QQQQ-987
+  termination_grace_period_seconds: 30
 ---
 service_type: alertmanager
 service_name: alertmanager
@@ -513,11 +514,14 @@ spec:
 """.split('---\n'))
 def test_yaml(y):
     data = yaml.safe_load(y)
-    object = ServiceSpec.from_json(data)
+    obj = ServiceSpec.from_json(data)
 
-    assert yaml.dump(object) == y
-    assert yaml.dump(ServiceSpec.from_json(object.to_json())) == y
+    obj_dict = obj.to_json()
+    for key in ['data_devices', 'db_devices', 'wal_devices']:
+        if key in obj_dict.get('spec', {}):
+            obj_dict['spec'][key] = data['spec'][key]
 
+    assert obj_dict == data
 
 def test_alertmanager_spec_1():
     spec = AlertManagerSpec()
@@ -1308,3 +1312,99 @@ def test_extra_args_handling(y, ec_args, ee_args, ec_final_args, ee_final_args):
         for args in spec_obj.extra_entrypoint_args:
             ee_res.extend(args.to_args())
         assert ee_res == ee_final_args
+  
+def test_osd_has_default_termination_grace_when_missing():
+    
+    spec_data = """
+service_type: osd
+service_id: osd_spec_default
+placement:
+  host_pattern: '*'
+spec:
+  data_devices:
+    model: MC-55-44-XZ
+  db_devices:
+    model: SSD-123-foo
+  filter_logic: AND
+  objectstore: bluestore
+  wal_devices:
+    model: NVME-QQQQ-987
+"""
+    data = yaml.safe_load(spec_data)
+    spec_obj = ServiceSpec.from_json(data)
+
+    j = spec_obj.to_json()
+    assert 'spec' in j
+    assert 'termination_grace_period_seconds' in j['spec']
+    assert j['spec']['termination_grace_period_seconds'] == 30
+
+def test_termination_grace_roundtrip_preserves_value():
+    spec_data = """
+service_type: osd
+service_id: osd_with_custom_timeout
+placement:
+  host_pattern: '*'
+spec:
+  termination_grace_period_seconds: 7
+  data_devices:
+    model: MC-55-44-XZ
+"""
+    data = yaml.safe_load(spec_data)
+    spec_obj = ServiceSpec.from_json(data)
+
+    j = spec_obj.to_json()
+    assert j['spec']['termination_grace_period_seconds'] == 7
+
+    spec2 = ServiceSpec.from_json(j)
+    j2 = spec2.to_json()
+    assert j2['spec']['termination_grace_period_seconds'] == 7
+
+def test_negative_termination_grace_raises_validation_error():
+    spec_data = """
+service_type: osd
+service_id: osd_bad_timeout
+placement:
+  host_pattern: '*'
+spec:
+  termination_grace_period_seconds: -1
+"""
+    data = yaml.safe_load(spec_data)
+    with pytest.raises(SpecValidationError):
+        ServiceSpec.from_json(data)
+
+@pytest.mark.parametrize("spec_data", [
+    """
+service_type: mgr
+placement:
+  count: 1
+""",
+    """
+service_type: mon
+placement:
+  count: 1
+""",
+
+    """
+service_type: rgw
+service_id: default-rgw
+placement:
+  hosts:
+    - ceph-001
+""",
+
+    """
+service_type: nfs
+service_id: mynfs
+placement:
+  count: 1
+""",
+])
+def test_non_osd_services_do_not_get_default_termination_if_not_provided(spec_data):
+
+    data = yaml.safe_load(spec_data)
+    spec_obj = ServiceSpec.from_json(data)
+
+    j = spec_obj.to_json()
+
+    spec_section = j.get('spec', {})
+    assert 'termination_grace_period_seconds' not in spec_section