]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: plumb force_delete_data through daemon/service removal 68428/head
authorRedouane Kachach <rkachach@ibm.com>
Wed, 3 Dec 2025 08:28:08 +0000 (09:28 +0100)
committerKobi Ginon <kginon@redhat.com>
Mon, 27 Apr 2026 11:00:00 +0000 (14:00 +0300)
This PR wires the `force_delete_data` already existing flag in the
binary through cephadm’s daemon and service removal paths, so that
commands such as `ceph orch rm service` or equivalent daemon removal
can explicitly ask for data deletion instead of the default "move
under <fsid>/removed/" for daemons such as Prometheus, osd and mon.

Fixes: https://tracker.ceph.com/issues/74058
Signed-off-by: Kobi Ginon <kginon@redhat.com>
doc/cephadm/services/index.rst
doc/cephadm/services/monitoring.rst
doc/mgr/orchestrator.rst
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py

index dfe737f83d2030022e09ace638603d303a435bda..055662351281797710c1af441a533ab21c64289f 100644 (file)
@@ -804,6 +804,13 @@ For example:
 
     ceph orch rm rgw.myrgw
 
+The same command accepts ``--force`` and ``--force-delete-data``. Use
+``ceph orch rm <service-name> --force --force-delete-data`` when you want
+cephadm to remove on-disk data for supported daemon types instead of relocating
+it under ``<fsid>/removed/``. The latter flag requires ``--force``. The same
+pair of flags applies to ``ceph orch daemon rm <daemon-name>`` when removing
+individual daemons.
+
 
 .. _cephadm-spec-unmanaged:
 
index d89818cf9e1b17cf3456eced62564383a906a078..8552e2316bef40dabefb61b5a5da2af1cb4e37b7 100644 (file)
@@ -463,11 +463,16 @@ To disable monitoring and remove the software that supports it, run the followin
 .. prompt:: bash #
 
   ceph orch rm grafana
-  ceph orch rm prometheus --force   # this will delete metrics data collected so far
+  ceph orch rm prometheus
   ceph orch rm node-exporter
   ceph orch rm alertmanager
   ceph mgr module disable prometheus
 
+By default, cephadm moves Prometheus data aside under ``<fsid>/removed/`` on the
+host. To delete that data instead, use
+``ceph orch rm prometheus --force --force-delete-data`` in place of
+``ceph orch rm prometheus`` (``--force-delete-data`` requires ``--force``).
+
 See also :ref:`orch-rm`.
 
 Setting up RBD-Image Monitoring
index c3fd5bee405272c0503f4e61cd34edab604d0998..154015c1ff74ca54fc40bd12613f634acdfe99a4 100644 (file)
@@ -113,10 +113,19 @@ Creating/growing/shrinking/removing services:
    ceph orch apply mds <fs_name> [--placement=<placement>] [--dry-run]
    ceph orch apply rgw <name> [--realm=<realm>] [--zone=<zone>] [--port=<port>] [--ssl] [--placement=<placement>] [--dry-run]
    ceph orch apply nfs <name> <pool> [--namespace=<namespace>] [--placement=<placement>] [--dry-run]
-   ceph orch rm <service_name> [--force]
+   ceph orch rm <service_name> [--force] [--force-delete-data]
 
 where ``placement`` is a :ref:`orchestrator-cli-placement-spec`.
 
+For ``ceph orch rm``, ``--force`` may be required in some cases (for example when
+removing an OSD service that would leave OSDs behind).
+
+The ``--force-delete-data`` flag requires that ``--force`` is also passed.
+This directs cephadm to delete on-disk data for certain daemon types instead
+of moving it to ``<fsid>/removed/`` on the host. These daemon types include
+``mon``, ``osd``, and ``prometheus``. This helps avoid filling up the
+underlying filesystem over time.
+
 e.g., ``ceph orch apply mds myfs --placement="3 host1 host2 host3"``
 
 Service Commands:
index aed6ba03efadb406ce9fc42a08864007bed7a1d2..cdb75d2b75f14c0ea5d25969090e84d87d2d8dc6 100644 (file)
@@ -239,7 +239,7 @@ class SpecDescription(NamedTuple):
     spec: ServiceSpec
     rank_map: Optional[Dict[int, Dict[int, Optional[str]]]]
     created: datetime.datetime
-    deleted: Optional[datetime.datetime]
+    deleted: Optional[Tuple[datetime.datetime, bool]]
 
 
 class SpecStore():
@@ -250,7 +250,7 @@ class SpecStore():
         # service_name -> rank -> gen -> daemon_id
         self._rank_maps = {}    # type: Dict[str, Dict[int, Dict[int, Optional[str]]]]
         self.spec_created = {}  # type: Dict[str, datetime.datetime]
-        self.spec_deleted = {}  # type: Dict[str, datetime.datetime]
+        self.spec_deleted = {}  # type: Dict[str, Tuple[datetime.datetime, bool]]
         self.spec_preview = {}  # type: Dict[str, ServiceSpec]
         self._needs_configuration: Dict[str, bool] = {}
 
@@ -315,8 +315,11 @@ class SpecStore():
                 self.spec_created[service_name] = created
 
                 if 'deleted' in j:
-                    deleted = str_to_datetime(cast(str, j['deleted']))
-                    self.spec_deleted[service_name] = deleted
+                    deleted_ts = str_to_datetime(cast(str, j['deleted']))
+                    force_delete_data = cast(
+                        bool, j.get('force_delete_data', False)
+                    )
+                    self.spec_deleted[service_name] = (deleted_ts, force_delete_data)
 
                 if 'needs_configuration' in j:
                     self._needs_configuration[service_name] = cast(bool, j['needs_configuration'])
@@ -378,7 +381,9 @@ class SpecStore():
         if name in self._rank_maps:
             data['rank_map'] = self._rank_maps[name]
         if name in self.spec_deleted:
-            data['deleted'] = datetime_to_str(self.spec_deleted[name])
+            deleted_time, force_delete_data = self.spec_deleted[name]
+            data['deleted'] = datetime_to_str(deleted_time)
+            data['force_delete_data'] = force_delete_data
         if name in self._needs_configuration:
             data['needs_configuration'] = self._needs_configuration[name]
 
@@ -469,7 +474,7 @@ class SpecStore():
                         service_name=nvmeof_spec.service_name(),
                         user_made=True)
 
-    def rm(self, service_name: str) -> bool:
+    def rm(self, service_name: str, force_delete_data: bool = False) -> bool:
         if service_name not in self._specs:
             return False
 
@@ -477,7 +482,7 @@ class SpecStore():
             self.finally_rm(service_name)
             return True
 
-        self.spec_deleted[service_name] = datetime_now()
+        self.spec_deleted[service_name] = (datetime_now(), force_delete_data)
         self.save(self._specs[service_name], update_create=False)
         return True
 
index e6704e8a023c2d4dd194164cf0f335c9f26c31bc..e631d71b62b61c40fa2c9adaa36e182e537333d0 100644 (file)
@@ -2494,13 +2494,14 @@ Then run the following:
             else:
                 size = spec.placement.get_target_count(self.cache.get_schedulable_hosts())
 
+            deleted_ts = self.spec_store.spec_deleted.get(nm)
             svc_desc = orchestrator.ServiceDescription(
                 spec=spec,
                 size=size,
                 running=0,
                 events=self.events.get_for_service(spec.service_name()),
                 created=self.spec_store.spec_created[nm],
-                deleted=self.spec_store.spec_deleted.get(nm, None),
+                deleted=deleted_ts[0] if deleted_ts else None,
                 virtual_ip=spec.get_virtual_ip(),
                 ports=spec.get_port_start(),
             )
@@ -2786,28 +2787,61 @@ Then run the following:
         return msg
 
     @handle_orch_error
-    def remove_daemons(self, names):
-        # type: (List[str]) -> List[str]
+    def remove_daemons(self,
+                       names: List[str],
+                       force_delete_data: bool = False) -> List[str]:
+        """
+        Remove specific daemon(s).
+
+        :param names: daemon names to remove
+        :param force: skip safety checks (PRECIOUS DATA warning)
+        :param force_delete_data: if True, request that cephadm delete the
+                                  daemon data instead of moving it under
+                                  <fsid>/removed/.
+        """
         args = []
         for host, dm in self.cache.daemons.items():
             for name in names:
                 if name in dm:
-                    args.append((name, host))
+                    args.append((name, host, force_delete_data))
         if not args:
             raise OrchestratorError('Unable to find daemon(s) %s' % (names), errno=errno.EINVAL)
         self.log.info('Remove daemons %s' % ' '.join([a[0] for a in args]))
         return self._remove_daemons(args)
 
     @handle_orch_error
-    def remove_service(self, service_name: str, force: bool = False) -> str:
-        self.log.info('Remove service %s' % service_name)
+    def remove_service(
+        self,
+        service_name: str,
+        force: bool = False,
+        force_delete_data: bool = False
+    ) -> str:
+        """
+        Remove a service.
+
+        :param service_name: service to remove
+        :param force: skip safety checks (e.g., leftover OSDs)
+        :param force_delete_data: intent to delete backing daemon data instead
+                                  of moving it under <fsid>/removed/.
+                                  (actual effect depends on lower layers)
+        """
+        self.log.info(
+            'Remove service %s (force=%s, force_delete_data=%s)' %
+            (service_name, force, force_delete_data)
+        )
         self._trigger_preview_refresh(service_name=service_name)
         if service_name in self.spec_store:
             if self.spec_store[service_name].spec.service_type in ('mon', 'mgr'):
-                return f'Unable to remove {service_name} service.\n' \
-                       f'Note, you might want to mark the {service_name} service as "unmanaged"'
+                return (
+                    f'Unable to remove {service_name} service.\n'
+                    f'Note, you might want to mark the {service_name} '
+                    f'service as "unmanaged"'
+                )
         else:
-            return f"Invalid service '{service_name}'. Use 'ceph orch ls' to list available services.\n"
+            return (
+                f"Invalid service '{service_name}'. Use 'ceph orch ls' to "
+                f"list available services.\n"
+            )
 
         # Report list of affected OSDs?
         if not force and service_name.startswith('osd.'):
@@ -2824,9 +2858,11 @@ Then run the following:
                 for h, ls in osds_msg.items():
                     msg += f'\thost {h}: {" ".join([f"osd.{id}" for id in ls])}'
                 raise OrchestratorError(
-                    f'If {service_name} is removed then the following OSDs will remain, --force to proceed anyway\n{msg}')
+                    f'If {service_name} is removed then the following OSDs '
+                    f'will remain, --force to proceed anyway\n{msg}'
+                )
 
-        found = self.spec_store.rm(service_name)
+        found = self.spec_store.rm(service_name, force_delete_data)
         if found and service_name.startswith('osd.'):
             self.spec_store.finally_rm(service_name)
         self._kick_serve_loop()
@@ -3244,8 +3280,10 @@ Then run the following:
         return previews_for_specs
 
     @forall_hosts
-    def _remove_daemons(self, name: str, host: str) -> str:
-        return CephadmServe(self)._remove_daemon(name, host)
+    def _remove_daemons(self, name: str, host: str, force_delete_data: bool = False) -> str:
+        # pass force_delete_data by keyword: third positional arg is no_post_remove
+        return CephadmServe(self)._remove_daemon(
+            name, host, force_delete_data=force_delete_data)
 
     def _check_pool_exists(self, pool: str, service_name: str) -> None:
         logger.info(f'Checking pool "{pool}" exists for service {service_name}')
index c98321e8111b959eb5a5597c169df4cd07743bc4..75e7a8aa5d40b0f9d1e22c88241a8cb9bfb051c5 100644 (file)
@@ -1116,15 +1116,16 @@ class CephadmServe:
             if not spec and dd.daemon_type not in ['mon', 'mgr', 'osd']:
                 # (mon and mgr specs should always exist; osds aren't matched
                 # to a service spec)
+                force_delete_data = False
+                if dd.service_name() in self.mgr.spec_store.spec_deleted:
+                    _, force_delete_data = self.mgr.spec_store.spec_deleted[dd.service_name()]
                 self.log.info('Removing orphan daemon %s...' % dd.name())
-                self._remove_daemon(dd.name(), dd.hostname)
-
-            # ignore unmanaged services
-            if spec and spec.unmanaged:
+                self._remove_daemon(dd.name(), dd.hostname, force_delete_data=force_delete_data)
+                # This daemon was removed from cache; skip any additional checks
+                # in this iteration to avoid looking up stale daemon entries.
                 continue
 
-            # ignore daemons for deleted services
-            if dd.service_name() in self.mgr.spec_store.spec_deleted:
+            if spec and spec.unmanaged:
                 continue
 
             if dd.daemon_type == 'agent':
@@ -1593,7 +1594,7 @@ class CephadmServe:
             ic_params.append(ic.to_json(flatten_args=True))
         return ic_meta
 
-    def _remove_daemon(self, name: str, host: str, no_post_remove: bool = False) -> str:
+    def _remove_daemon(self, name: str, host: str, no_post_remove: bool = False, force_delete_data: bool = False) -> str:
         """
         Remove a daemon
         """
@@ -1612,10 +1613,11 @@ class CephadmServe:
             service_registry.get_service(daemon_type_to_service(daemon_type)).pre_remove(daemon)
             # NOTE: we are passing the 'force' flag here, which means
             # we can delete a mon instances data.
+            args = ['--name', name, '--force']
+            if force_delete_data:
+                args.append('--force-delete-data')
             if dd.ports:
-                args = ['--name', name, '--force', '--tcp-ports', ' '.join(map(str, dd.ports))]
-            else:
-                args = ['--name', name, '--force']
+                args.extend(['--tcp-ports', ' '.join(map(str, dd.ports))])
 
             self.log.info('Removing daemon %s from %s -- ports %s' % (name, host, dd.ports))
             with self.mgr.async_timeout_handler(host, f'cephadm rm-daemon (daemon {name})'):
index 264919cd0f95909c2ad220d6c6eb5257b354912a..2eb16c85cb9094331e2e3e434f86d601ed21bc25 100644 (file)
@@ -1597,6 +1597,50 @@ class TestCephadm(object):
             out = wait(cephadm_module, c)
             assert out == ["Removed rgw.myrgw.myhost.myid from host 'test'"]
 
+    @pytest.mark.parametrize(
+        "daemon_name",
+        ["osd.424242", "prometheus.force_delete_data_regression"]
+    )
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    def test_remove_daemon_force_delete_data_passes_cephadm_flag(self, _run_cephadm, cephadm_module, daemon_name):
+        """Regression: force_delete_data must be passed as a keyword to _remove_daemon.
+
+        Otherwise the third positional arg is bound to no_post_remove and cephadm
+        never receives --force-delete-data (see _remove_daemon parameter order).
+        """
+        ls_json = json.dumps([
+            dict(
+                name=daemon_name,
+                style='cephadm',
+                fsid='fsid',
+                container_id='container_id',
+                version='version',
+                state='running',
+            )
+        ])
+
+        async def side_effect(host, entity, command, args, **kwargs):
+            if command == 'ls':
+                return [ls_json], '', 0
+            return ['{}'], '', 0
+
+        _run_cephadm.side_effect = side_effect
+
+        with with_host(cephadm_module, 'test'):
+            CephadmServe(cephadm_module)._refresh_host_daemons('test')
+            c = cephadm_module.list_daemons()
+            wait(cephadm_module, c)
+            c = cephadm_module.remove_daemons([daemon_name], force_delete_data=True)
+            wait(cephadm_module, c)
+
+        rm_daemon_calls = [
+            call for call in _run_cephadm.call_args_list
+            if len(call[0]) >= 4 and call[0][2] == 'rm-daemon'
+        ]
+        assert len(rm_daemon_calls) == 1
+        rm_args = rm_daemon_calls[0][0][3]
+        assert '--force-delete-data' in rm_args
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
     def test_remove_duplicate_osds(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
         _run_cephadm.side_effect = async_side_effect(('{}', '', 0))
index 136fde595ac05abde23e2a7785510544f07577e6..a230594d60f37ad2f9967d7fe146dda5903390cf 100644 (file)
@@ -662,7 +662,7 @@ class Orchestrator(object):
         """
         raise NotImplementedError()
 
-    def remove_daemons(self, names: List[str]) -> OrchResult[List[str]]:
+    def remove_daemons(self, names: List[str], force_delete_data: bool = False) -> OrchResult[List[str]]:
         """
         Remove specific daemon(s).
 
@@ -670,7 +670,7 @@ class Orchestrator(object):
         """
         raise NotImplementedError()
 
-    def remove_service(self, service_name: str, force: bool = False) -> OrchResult[str]:
+    def remove_service(self, service_name: str, force: bool = False, force_delete_data: bool = False) -> OrchResult[str]:
         """
         Remove a service (a collection of daemons).
 
index 4b4489237e87f414a966a57e3d9d9e94e03ac76c..7b559370c33d0861e501c9f8793b9ce10d8cc153 100644 (file)
@@ -1802,25 +1802,67 @@ Usage:
     @OrchestratorCLICommand.Write('orch daemon rm')
     def _daemon_rm(self,
                    names: List[str],
-                   force: Optional[bool] = False) -> HandleCommandResult:
-        """Remove specific daemon(s)"""
+                   force: bool = False,
+                   force_delete_data: bool = False) -> HandleCommandResult:
+        """
+        Remove specific daemon(s).
+
+        When used with --force-delete-data, data for certain daemon types
+        (mon, osd, prometheus) will be deleted instead of being moved to
+        <fsid>/removed/.
+        """
         for name in names:
             if '.' not in name:
-                return HandleCommandResult(stderr=f"{name} is not a valid daemon name", retval=-errno.EINVAL)
-            (daemon_type) = name.split('.')[0]
+                return HandleCommandResult(
+                    stderr=f"{name} is not a valid daemon name",
+                    retval=-errno.EINVAL
+                )
+
+            daemon_type = name.split('.')[0]
+
             if not force and daemon_type in ['osd', 'mon', 'prometheus']:
-                return HandleCommandResult(stderr=f"must pass --force to REMOVE daemon with potentially PRECIOUS DATA for {name}", retval=-errno.EPERM)
-        completion = self.remove_daemons(names)
+                return HandleCommandResult(
+                    stderr=f"must pass --force to remove daemon with potentially precious data for {name}",
+                    retval=-errno.EPERM
+                )
+
+        if force_delete_data and not force:
+            # extra safety: don’t allow delete-data without force
+            return HandleCommandResult(
+                stderr="--force-delete-data requires --force",
+                retval=-errno.EPERM
+            )
+
+        completion = self.remove_daemons(
+            names,
+            force_delete_data=force_delete_data,
+        )
         return completion_to_result(completion)
 
     @OrchestratorCLICommand.Write('orch rm')
     def _service_rm(self,
                     service_name: str,
-                    force: bool = False) -> HandleCommandResult:
-        """Remove a service"""
+                    force: bool = False,
+                    force_delete_data: bool = False) -> HandleCommandResult:
+        """
+        Remove a service.
+
+        When used with --force-delete-data, data for stateful daemons belonging
+        to this service (e.g. mon, osd, prometheus) will be deleted instead of
+        being moved to <fsid>/removed/.
+        """
         if service_name in ['mon', 'mgr'] and not force:
             raise OrchestratorError('The mon and mgr services cannot be removed')
-        completion = self.remove_service(service_name, force=force)
+
+        if force_delete_data and not force:
+            # same safety rule as for daemon rm
+            raise OrchestratorError('--force-delete-data requires --force')
+
+        completion = self.remove_service(
+            service_name,
+            force=force,
+            force_delete_data=force_delete_data,
+        )
         raise_if_exception(completion)
         return HandleCommandResult(stdout=completion.result_str())