From dcebce1d114257b9676bb23e2be57778ee39fd8e Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 11 Jan 2021 13:23:04 -0500 Subject: [PATCH] mgr/cephadm: force flag for ok-to-stop and ok-to-stop for monitoring stack Daemons that could cause data loss when stopped will always block. Daemons that will only cause loss in availability should block but have a workaround in the form of a force flag if the user is okay with the service being down. Also implements ok-to-stop for monitoring stack daemons that uses this system of blocking on availability loss unless force flag is provided Signed-off-by: Adam King Signed-off-by: Daniel-Pivonka (cherry picked from commit aeeffb07f1bb226b64a362de4eecd7d63fc0b0d1) --- doc/mgr/orchestrator.rst | 7 ++++ src/pybind/mgr/cephadm/module.py | 33 ++++++++++++++----- src/pybind/mgr/cephadm/serve.py | 3 +- .../mgr/cephadm/services/cephadmservice.py | 23 ++++++++++++- src/pybind/mgr/cephadm/services/monitoring.py | 27 +++++++++++++++ src/pybind/mgr/cephadm/tests/test_cephadm.py | 2 +- src/pybind/mgr/cephadm/upgrade.py | 6 ++-- src/pybind/mgr/orchestrator/__init__.py | 2 +- src/pybind/mgr/orchestrator/_interface.py | 2 +- src/pybind/mgr/orchestrator/module.py | 4 +-- 10 files changed, 92 insertions(+), 17 deletions(-) diff --git a/doc/mgr/orchestrator.rst b/doc/mgr/orchestrator.rst index 11c81004a72c4..d8009527ea611 100644 --- a/doc/mgr/orchestrator.rst +++ b/doc/mgr/orchestrator.rst @@ -72,6 +72,13 @@ Add and remove hosts:: ceph orch host add [] [...] ceph orch host rm +Place a host in and out of maintenance mode (stops all Ceph daemons on host):: + + ceph orch host maintenance enter [--force] + ceph orch host maintenance exit + +Where the force flag when entering maintenance allows the user to bypass warnings (but not alerts) + For cephadm, see also :ref:`cephadm-fqdn` and :ref:`cephadm-removing-hosts`. 
Host Specification diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 2e88dafe67511..a6ffdb4a037be 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1256,7 +1256,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.log.info('Removed label %s to host %s' % (label, host)) return 'Removed label %s from host %s' % (label, host) - def _host_ok_to_stop(self, hostname: str) -> Tuple[int, str]: + def _host_ok_to_stop(self, hostname: str, force: bool = False) -> Tuple[int, str]: self.log.debug("running host-ok-to-stop checks") daemons = self.cache.get_daemons() daemon_map: Dict[str, List[str]] = defaultdict(lambda: []) @@ -1267,12 +1267,28 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, if dd.hostname == hostname: daemon_map[dd.daemon_type].append(dd.daemon_id) + notifications: List[str] = [] + error_notifications: List[str] = [] + okay: bool = True for daemon_type, daemon_ids in daemon_map.items(): - r = self.cephadm_services[daemon_type_to_service(daemon_type)].ok_to_stop(daemon_ids) + r = self.cephadm_services[daemon_type_to_service( + daemon_type)].ok_to_stop(daemon_ids, force) if r.retval: - self.log.error(f'It is NOT safe to stop host {hostname}') - return r.retval, r.stderr - + okay = False + # collect error notifications so user can see every daemon causing host + # to not be okay to stop + error_notifications.append(r.stderr) + if r.stdout: + # if extra notifications to print for user, add them to notifications list + notifications.append(r.stdout) + + if not okay: + # at least one daemon is not okay to stop + return 1, '\n'.join(error_notifications) + + if notifications: + return 0, (f'It is presumed safe to stop host {hostname}. 
' + + 'Note the following:\n\n' + '\n'.join(notifications)) return 0, f'It is presumed safe to stop host {hostname}' @trivial_completion @@ -1304,7 +1320,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, @trivial_completion @host_exists() - def enter_host_maintenance(self, hostname: str) -> str: + def enter_host_maintenance(self, hostname: str, force: bool = False) -> str: """ Attempt to place a cluster host in maintenance Placing a host into maintenance disables the cluster's ceph target in systemd @@ -1333,9 +1349,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, if host_daemons: # daemons on this host, so check the daemons can be stopped # and if so, place the host into maintenance by disabling the target - rc, msg = self._host_ok_to_stop(hostname) + rc, msg = self._host_ok_to_stop(hostname, force) if rc: - raise OrchestratorError(msg, errno=rc) + raise OrchestratorError( + msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc) # call the host-maintenance function _out, _err, _code = CephadmServe(self)._run_cephadm(hostname, cephadmNoImage, "host-maintenance", diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 2b54be22b92d3..f988295fda04a 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -574,7 +574,8 @@ class CephadmServe: def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool: daemon_ids = [d.daemon_id for d in remove_daemon_hosts] assert None not in daemon_ids - r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids)) + # setting force flag retains previous behavior, should revisit later. 
+ r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids), force=True) return not r.retval while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts): diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 4632d5a0fa358..d9fee1270ab0b 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -208,7 +208,7 @@ class CephadmService(metaclass=ABCMeta): except MonCommandFailed as e: logger.warning('Failed to set Dashboard config for %s: %s', service_name, e) - def ok_to_stop(self, daemon_ids: List[str]) -> HandleCommandResult: + def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult: names = [f'{self.TYPE}.{d_id}' for d_id in daemon_ids] out = f'It is presumed safe to stop {names}' err = f'It is NOT safe to stop {names}' @@ -231,6 +231,27 @@ class CephadmService(metaclass=ABCMeta): logger.info(out) return HandleCommandResult(r.retval, out, r.stderr) + def _enough_daemons_to_stop(self, daemon_type: str, daemon_ids: List[str], service: str, low_limit: int) -> Tuple[bool, str]: + # Provides a warning about if it possible or not to stop daemons in a service + names = [f'{daemon_type}.{d_id}' for d_id in daemon_ids] + number_of_running_daemons = len( + [daemon for daemon in self.mgr.cache.get_daemons_by_type(daemon_type) if daemon.status == 1]) + if (number_of_running_daemons - len(daemon_ids)) >= low_limit: + return False, f'It is presumed safe to stop {names}' + + num_daemons_left = number_of_running_daemons - len(daemon_ids) + + def plural(count: int) -> str: + return 'daemon' if count == 1 else 'daemons' + + daemon_count = "only" if number_of_running_daemons == 1 else number_of_running_daemons + left_count = "no" if num_daemons_left == 0 else num_daemons_left + + out = (f'WARNING: Stopping {len(daemon_ids)} out of {number_of_running_daemons} daemons in {service} service. 
' + f'Service will not be operational with {left_count} {plural(num_daemons_left)} left. ' + f'At least {low_limit} {plural(low_limit)} must be running to guarantee service. ') + return True, out + def pre_remove(self, daemon: DaemonDescription) -> None: """ Called before the daemon is removed. diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index cf6823e2fe120..d0aeb2acaffe6 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -1,7 +1,10 @@ +import errno import logging import os from typing import List, Any, Tuple, Dict +from mgr_module import HandleCommandResult + from orchestrator import DaemonDescription from ceph.deployment.service_spec import AlertManagerSpec from cephadm.services.cephadmservice import CephadmService, CephadmDaemonSpec @@ -80,6 +83,12 @@ class GrafanaService(CephadmService): service_url ) + def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult: + warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1) + if warn and not force: + return HandleCommandResult(-errno.EBUSY, None, warn_message) + return HandleCommandResult(0, warn_message, None) + class AlertmanagerService(CephadmService): TYPE = 'alertmanager' @@ -165,6 +174,12 @@ class AlertmanagerService(CephadmService): service_url ) + def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult: + warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1) + if warn and not force: + return HandleCommandResult(-errno.EBUSY, None, warn_message) + return HandleCommandResult(0, warn_message, None) + class PrometheusService(CephadmService): TYPE = 'prometheus' @@ -263,6 +278,12 @@ class PrometheusService(CephadmService): service_url ) + def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult: + warn, warn_message = 
self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1) + if warn and not force: + return HandleCommandResult(-errno.EBUSY, None, warn_message) + return HandleCommandResult(0, warn_message, None) + class NodeExporterService(CephadmService): TYPE = 'node-exporter' @@ -274,3 +295,9 @@ class NodeExporterService(CephadmService): def generate_config(self, daemon_spec: CephadmDaemonSpec) -> Tuple[Dict[str, Any], List[str]]: assert self.TYPE == daemon_spec.daemon_type return {}, [] + + def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult: + # since node exporter runs on each host and cannot compromise data, no extra checks required + names = [f'{self.TYPE}.{d_id}' for d_id in daemon_ids] + out = f'It is presumed safe to stop {names}' + return HandleCommandResult(0, out, None) diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index ee62093bf2886..840e82d0167e5 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -776,7 +776,7 @@ class TestCephadm(object): match_glob(out, "Scheduled mds.fsname update...") CephadmServe(cephadm_module)._apply_all_services() - ok_to_stop.assert_called_with([daemon[4:]]) + ok_to_stop.assert_called_with([daemon[4:]], force=True) assert_rm_daemon(cephadm_module, spec.service_name(), 'host1') # verifies ok-to-stop assert_rm_daemon(cephadm_module, spec.service_name(), 'host2') diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index f9d1ba11ec98c..4e21a99547a13 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Optional, Dict import orchestrator from cephadm.serve import CephadmServe from cephadm.utils import name_to_config_section, CEPH_UPGRADE_ORDER -from orchestrator import OrchestratorError, DaemonDescription +from orchestrator import OrchestratorError, 
DaemonDescription, daemon_type_to_service, service_to_daemon_types if TYPE_CHECKING: from .module import CephadmOrchestrator @@ -179,7 +179,9 @@ class CephadmUpgrade: if not self.upgrade_state or self.upgrade_state.paused: return False - r = self.mgr.cephadm_services[s.daemon_type].ok_to_stop([s.daemon_id]) + # setting force flag to retain old functionality. + r = self.mgr.cephadm_services[daemon_type_to_service(s.daemon_type)].ok_to_stop([ + s.daemon_id], force=True) if not r.retval: logger.info(f'Upgrade: {r.stdout}') diff --git a/src/pybind/mgr/orchestrator/__init__.py b/src/pybind/mgr/orchestrator/__init__.py index 2d6cba8a15626..4bcb26cabfa3a 100644 --- a/src/pybind/mgr/orchestrator/__init__.py +++ b/src/pybind/mgr/orchestrator/__init__.py @@ -16,4 +16,4 @@ from ._interface import \ DaemonDescription, \ OrchestratorEvent, set_exception_subject, \ InventoryHost, DeviceLightLoc, \ - UpgradeStatusSpec + UpgradeStatusSpec, daemon_type_to_service, service_to_daemon_types diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index cc726d52d7da2..6034e219daf0b 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -839,7 +839,7 @@ class Orchestrator(object): """ raise NotImplementedError() - def enter_host_maintenance(self, hostname: str) -> Completion: + def enter_host_maintenance(self, hostname: str, force: bool = False) -> Completion: """ Place a host in maintenance, stopping daemons and disabling it's systemd target """ diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index c0ed80efd826c..3859d11c155d2 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -394,11 +394,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, @_cli_write_command( 'orch host maintenance enter') - def _host_maintenance_enter(self, hostname: str) -> HandleCommandResult: + def 
_host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult: """ Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only) """ - completion = self.enter_host_maintenance(hostname) + completion = self.enter_host_maintenance(hostname, force=force) self._orchestrator_wait([completion]) raise_if_exception(completion) -- 2.39.5