From: Daniel Pivonka Date: Wed, 14 Jul 2021 13:27:59 +0000 (-0400) Subject: mgr/cephadm: add ability to remove offline host X-Git-Tag: v16.2.6~25^2~26 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=f2c30f72a0b79955bd4e3f62fe4ca6b71253629c;p=ceph.git mgr/cephadm: add ability to remove offline host Signed-off-by: Daniel Pivonka (cherry picked from commit 219887301dfdeac55c75a7105bf3a2851fe56387) --- diff --git a/doc/cephadm/host-management.rst b/doc/cephadm/host-management.rst index 35829fa4ac63d..621f2a753b1ed 100644 --- a/doc/cephadm/host-management.rst +++ b/doc/cephadm/host-management.rst @@ -94,6 +94,18 @@ Once all daemons are removed you can remove the host with the following: ceph orch host rm <host> +Offline host removal +-------------------- + +If a host is offline and can not be recovered it can still be removed from the cluster with the following: + +.. prompt:: bash # + + ceph orch host rm <host> --offline --force + +This can potentially cause data loss as osds will be forcefully purged from the cluster by calling ``osd purge-actual`` for each osd. +Service specs that still contain this host should be manually updated. + .. _orchestrator-host-labels: Host labels diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 82776ca4456df..0973b9a3505a0 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1467,35 +1467,80 @@ Then run the following: return self._add_host(spec) @handle_orch_error - def remove_host(self, host): - # type: (str) -> str + def remove_host(self, host: str, force: bool = False, offline: bool = False) -> str: """ Remove a host from orchestrator management. :param host: host name + :param force: bypass running daemons check + :param offline: remove offline host """ - # Verify if it is possible to remove the host safely - daemons = self.cache.get_daemons_by_host(host) - if daemons: - self.log.warning(f"Blocked {host} removal. 
Daemons running: {daemons}") - daemons_table = "" - daemons_table += "{:<20} {:<15}\n".format("type", "id") - daemons_table += "{:<20} {:<15}\n".format("-" * 20, "-" * 15) + # check if host is offline + host_offline = host in self.offline_hosts + + if host_offline and not offline: + return "{} is offline, please use --offline and --force to remove this host. This can potentially cause data loss".format(host) + + if not host_offline and offline: + return "{} is online, please remove host without --offline.".format(host) + + if offline and not force: + return "Removing an offline host requires --force" + + # check if there are daemons on the host + if not force: + daemons = self.cache.get_daemons_by_host(host) + if daemons: + self.log.warning(f"Blocked {host} removal. Daemons running: {daemons}") + + daemons_table = "" + daemons_table += "{:<20} {:<15}\n".format("type", "id") + daemons_table += "{:<20} {:<15}\n".format("-" * 20, "-" * 15) + for d in daemons: + daemons_table += "{:<20} {:<15}\n".format(d.daemon_type, d.daemon_id) + + return "Not allowed to remove %s from cluster. " \ + "The following daemons are running in the host:" \ + "\n%s\nPlease run 'ceph orch host drain %s' to remove daemons from host" % ( + host, daemons_table, host) + + def run_cmd(cmd_args: dict) -> None: + ret, out, err = self.mon_command(cmd_args) + if ret != 0: + self.log.debug(f"ran {cmd_args} with mon_command") + self.log.error( + f"cmd: {cmd_args.get('prefix')} failed with: {err}. (errno:{ret})") + self.log.debug(f"cmd: {cmd_args.get('prefix')} returns: {out}") + + if offline: + daemons = self.cache.get_daemons_by_host(host) for d in daemons: - daemons_table += "{:<20} {:<15}\n".format(d.daemon_type, d.daemon_id) + self.log.info(f"removing: {d.name()}") - return "Not allowed to remove %s from cluster. 
" \ - "The following daemons are running in the host:" \ - "\n%s\nPlease run 'ceph orch host drain %s' to remove daemons from host" % ( - host, daemons_table, host) + if d.daemon_type != 'osd': + self.cephadm_services[str(d.daemon_type)].pre_remove(d) + self.cephadm_services[str(d.daemon_type)].post_remove(d) + else: + cmd_args = { + 'prefix': 'osd purge-actual', + 'id': int(str(d.daemon_id)), + 'yes_i_really_mean_it': True + } + run_cmd(cmd_args) + + cmd_args = { + 'prefix': 'osd crush rm', + 'name': host + } + run_cmd(cmd_args) self.inventory.rm_host(host) self.cache.rm_host(host) self._reset_con(host) self.event.set() # refresh stray health check self.log.info('Removed host %s' % host) - return "Removed host '{}'".format(host) + return "Removed {} host '{}'".format('offline' if offline else '', host) @handle_orch_error def update_host_addr(self, host: str, addr: str) -> str: diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 31ecc02fd35af..22f5c5640dc5a 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -343,7 +343,7 @@ class Orchestrator(object): """ raise NotImplementedError() - def remove_host(self, host: str) -> OrchResult[str]: + def remove_host(self, host: str, force: bool, offline: bool) -> OrchResult[str]: """ Remove a host from the orchestrator inventory. 
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 156a7eabf8c83..a556c91c0bbf6 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -346,9 +346,9 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, return self._apply_misc([s], False, Format.plain) @_cli_write_command('orch host rm') - def _remove_host(self, hostname: str) -> HandleCommandResult: + def _remove_host(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult: """Remove a host""" - completion = self.remove_host(hostname) + completion = self.remove_host(hostname, force, offline) raise_if_exception(completion) return HandleCommandResult(stdout=completion.result_str()) diff --git a/src/pybind/mgr/test_orchestrator/module.py b/src/pybind/mgr/test_orchestrator/module.py index f5729e12242f6..9d172737777fe 100644 --- a/src/pybind/mgr/test_orchestrator/module.py +++ b/src/pybind/mgr/test_orchestrator/module.py @@ -284,7 +284,7 @@ class TestOrchestrator(MgrModule, orchestrator.Orchestrator): return '' @handle_orch_error - def remove_host(self, host): + def remove_host(self, host, force: bool, offline: bool): assert isinstance(host, str) return 'done'