From fa0f62aa57755c45c713367620dc834530276b25 Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 29 Sep 2023 14:39:10 -0400 Subject: [PATCH] mgr/cephadm: add --rm-crush-entry flag to host removal This will tell cephadm to try and remove the crush bucket for the host at the end of the host removal process. If this fails, we still consider the host as having been successfully removed from cephadm's POV, but the user will get back an error message telling them we failed to remove the host from the crush map. Fixes: https://tracker.ceph.com/issues/63031 Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 21 +++++++++++++++++++-- src/pybind/mgr/orchestrator/_interface.py | 2 +- src/pybind/mgr/orchestrator/module.py | 4 ++-- src/pybind/mgr/test_orchestrator/module.py | 2 +- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index ac6e39c469e..a83256d0bb7 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -39,7 +39,13 @@ from cephadm.http_server import CephadmHttpServer from cephadm.agent import CephadmAgentHelpers -from mgr_module import MgrModule, HandleCommandResult, Option, NotifyType +from mgr_module import ( + MgrModule, + HandleCommandResult, + Option, + NotifyType, + MonCommandFailed, +) from mgr_util import build_url import orchestrator from orchestrator.module import to_format, Format @@ -1630,7 +1636,7 @@ Then run the following: return self._add_host(spec) @handle_orch_error - def remove_host(self, host: str, force: bool = False, offline: bool = False) -> str: + def remove_host(self, host: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> str: """ Remove a host from orchestrator management. 
@@ -1710,6 +1716,17 @@ Then run the following: } run_cmd(cmd_args) + if rm_crush_entry: + try: + self.check_mon_command({ + 'prefix': 'osd crush remove', + 'name': host, + }) + except MonCommandFailed as e: + self.log.error(f'Couldn\'t remove host {host} from CRUSH map: {str(e)}') + return (f'Cephadm failed removing host {host}\n' + f'Failed to remove host {host} from the CRUSH map: {str(e)}') + self.inventory.rm_host(host) self.cache.rm_host(host) self.ssh.reset_con(host) diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 2c777628002..5bde317d19e 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -359,7 +359,7 @@ class Orchestrator(object): """ raise NotImplementedError() - def remove_host(self, host: str, force: bool, offline: bool) -> OrchResult[str]: + def remove_host(self, host: str, force: bool, offline: bool, rm_crush_entry: bool) -> OrchResult[str]: """ Remove a host from the orchestrator inventory. 
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index de4777e0def..d6f36e81b71 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -488,9 +488,9 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, return self._apply_misc([s], False, Format.plain) @_cli_write_command('orch host rm') - def _remove_host(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult: + def _remove_host(self, hostname: str, force: bool = False, offline: bool = False, rm_crush_entry: bool = False) -> HandleCommandResult: """Remove a host""" - completion = self.remove_host(hostname, force, offline) + completion = self.remove_host(hostname, force, offline, rm_crush_entry) raise_if_exception(completion) return HandleCommandResult(stdout=completion.result_str()) diff --git a/src/pybind/mgr/test_orchestrator/module.py b/src/pybind/mgr/test_orchestrator/module.py index d89c23bf159..a0721250c7f 100644 --- a/src/pybind/mgr/test_orchestrator/module.py +++ b/src/pybind/mgr/test_orchestrator/module.py @@ -284,7 +284,7 @@ class TestOrchestrator(MgrModule, orchestrator.Orchestrator): return '' @handle_orch_error - def remove_host(self, host, force: bool, offline: bool): + def remove_host(self, host, force: bool, offline: bool, rm_crush_entry: bool): assert isinstance(host, str) return 'done' -- 2.39.5