mgr/cephadm: force flag for ok-to-stop and ok-to-stop for monitoring stack

author Adam King <adking@redhat.com>

Mon, 11 Jan 2021 18:23:04 +0000 (13:23 -0500)

committer Sebastian Wagner <sebastian.wagner@suse.com>

Fri, 29 Jan 2021 12:42:38 +0000 (13:42 +0100)
author Adam King <adking@redhat.com>
Mon, 11 Jan 2021 18:23:04 +0000 (13:23 -0500)
committer Sebastian Wagner <sebastian.wagner@suse.com>
Fri, 29 Jan 2021 12:42:38 +0000 (13:42 +0100)
diff --git a/doc/mgr/orchestrator.rst b/doc/mgr/orchestrator.rst

index 11c81004a72c476c5eea066cd7404877220eb7d1..d8009527ea611671b770e947045797859b3d3320 100644 (file)
--- a/doc/mgr/orchestrator.rst
+++ b/doc/mgr/orchestrator.rst
@@ -72,6 +72,13 @@ Add and remove hosts::
      ceph orch host add <hostname> [<addr>] [<labels>...]
      ceph orch host rm <hostname>
  
+Place a host in and out of maintenance mode (stops all Ceph daemons on host)::
+
+    ceph orch host maintenance enter <hostname> [--force]
+    ceph orch host maintenance exit <hostname>
+
+The ``--force`` flag, when entering maintenance, allows the user to bypass warnings (but not alerts).
+
  For cephadm, see also :ref:`cephadm-fqdn` and :ref:`cephadm-removing-hosts`.
  
  Host Specification
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 2e88dafe6751101bed6d3cab2c024c8fbc4b50b0..a6ffdb4a037be383e43f5b8c03f58502757cfc60 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1256,7 +1256,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
          self.log.info('Removed label %s to host %s' % (label, host))
          return 'Removed label %s from host %s' % (label, host)
  
-    def _host_ok_to_stop(self, hostname: str) -> Tuple[int, str]:
+    def _host_ok_to_stop(self, hostname: str, force: bool = False) -> Tuple[int, str]:
          self.log.debug("running host-ok-to-stop checks")
          daemons = self.cache.get_daemons()
          daemon_map: Dict[str, List[str]] = defaultdict(lambda: [])
@@ -1267,12 +1267,28 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
              if dd.hostname == hostname:
                  daemon_map[dd.daemon_type].append(dd.daemon_id)
  
+        notifications: List[str] = []
+        error_notifications: List[str] = []
+        okay: bool = True
          for daemon_type, daemon_ids in daemon_map.items():
-            r = self.cephadm_services[daemon_type_to_service(daemon_type)].ok_to_stop(daemon_ids)
+            r = self.cephadm_services[daemon_type_to_service(
+                daemon_type)].ok_to_stop(daemon_ids, force)
              if r.retval:
-                self.log.error(f'It is NOT safe to stop host {hostname}')
-                return r.retval, r.stderr
-
+                okay = False
+                # collect error notifications so user can see every daemon causing host
+                # to not be okay to stop
+                error_notifications.append(r.stderr)
+            if r.stdout:
+                # if extra notifications to print for user, add them to notifications list
+                notifications.append(r.stdout)
+
+        if not okay:
+            # at least one daemon is not okay to stop
+            return 1, '\n'.join(error_notifications)
+
+        if notifications:
+            return 0, (f'It is presumed safe to stop host {hostname}. ' +
+                       'Note the following:\n\n' + '\n'.join(notifications))
          return 0, f'It is presumed safe to stop host {hostname}'
  
      @trivial_completion
@@ -1304,7 +1320,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
  
      @trivial_completion
      @host_exists()
-    def enter_host_maintenance(self, hostname: str) -> str:
+    def enter_host_maintenance(self, hostname: str, force: bool = False) -> str:
          """ Attempt to place a cluster host in maintenance
  
          Placing a host into maintenance disables the cluster's ceph target in systemd
@@ -1333,9 +1349,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
          if host_daemons:
              # daemons on this host, so check the daemons can be stopped
              # and if so, place the host into maintenance by disabling the target
-            rc, msg = self._host_ok_to_stop(hostname)
+            rc, msg = self._host_ok_to_stop(hostname, force)
              if rc:
-                raise OrchestratorError(msg, errno=rc)
+                raise OrchestratorError(
+                    msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc)
  
              # call the host-maintenance function
              _out, _err, _code = CephadmServe(self)._run_cephadm(hostname, cephadmNoImage, "host-maintenance",
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py

index 2b54be22b92d357191f018266578d8bfa7df41ac..f988295fda04aff8c6f75686a51bb6904f679034 100644 (file)
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -574,7 +574,8 @@ class CephadmServe:
          def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool:
              daemon_ids = [d.daemon_id for d in remove_daemon_hosts]
              assert None not in daemon_ids
-            r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids))
+            # setting force flag retains previous behavior, should revisit later.
+            r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids), force=True)
              return not r.retval
  
          while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py

index 4632d5a0fa35824763aa60173f1f8d06ea91d9c0..d9fee1270ab0bf534f44f6fc420416f78a3f004c 100644 (file)
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -208,7 +208,7 @@ class CephadmService(metaclass=ABCMeta):
              except MonCommandFailed as e:
                  logger.warning('Failed to set Dashboard config for %s: %s', service_name, e)
  
-    def ok_to_stop(self, daemon_ids: List[str]) -> HandleCommandResult:
+    def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
          names = [f'{self.TYPE}.{d_id}' for d_id in daemon_ids]
          out = f'It is presumed safe to stop {names}'
          err = f'It is NOT safe to stop {names}'
@@ -231,6 +231,27 @@ class CephadmService(metaclass=ABCMeta):
          logger.info(out)
          return HandleCommandResult(r.retval, out, r.stderr)
  
+    def _enough_daemons_to_stop(self, daemon_type: str, daemon_ids: List[str], service: str, low_limit: int) -> Tuple[bool, str]:
+        # Provides a warning about whether or not it is possible to stop <n> daemons in a service
+        names = [f'{daemon_type}.{d_id}' for d_id in daemon_ids]
+        number_of_running_daemons = len(
+            [daemon for daemon in self.mgr.cache.get_daemons_by_type(daemon_type) if daemon.status == 1])
+        if (number_of_running_daemons - len(daemon_ids)) >= low_limit:
+            return False, f'It is presumed safe to stop {names}'
+
+        num_daemons_left = number_of_running_daemons - len(daemon_ids)
+
+        def plural(count: int) -> str:
+            return 'daemon' if count == 1 else 'daemons'
+
+        daemon_count = "only" if number_of_running_daemons == 1 else number_of_running_daemons
+        left_count = "no" if num_daemons_left == 0 else num_daemons_left
+
+        out = (f'WARNING: Stopping {len(daemon_ids)} out of {number_of_running_daemons} daemons in {service} service. '
+               f'Service will not be operational with {left_count} {plural(num_daemons_left)} left. '
+               f'At least {low_limit} {plural(low_limit)} must be running to guarantee service. ')
+        return True, out
+
      def pre_remove(self, daemon: DaemonDescription) -> None:
          """
          Called before the daemon is removed.
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py

index cf6823e2fe120e38711d8e03f8a842f9d36fb2a0..d0aeb2acaffe6f6e7e5ad415715b650f7b4d4758 100644 (file)
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -1,7 +1,10 @@
+import errno
  import logging
  import os
  from typing import List, Any, Tuple, Dict
  
+from mgr_module import HandleCommandResult
+
  from orchestrator import DaemonDescription
  from ceph.deployment.service_spec import AlertManagerSpec
  from cephadm.services.cephadmservice import CephadmService, CephadmDaemonSpec
@@ -80,6 +83,12 @@ class GrafanaService(CephadmService):
              service_url
          )
  
+    def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
+        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
+        if warn and not force:
+            return HandleCommandResult(-errno.EBUSY, None, warn_message)
+        return HandleCommandResult(0, warn_message, None)
+
  
  class AlertmanagerService(CephadmService):
      TYPE = 'alertmanager'
@@ -165,6 +174,12 @@ class AlertmanagerService(CephadmService):
              service_url
          )
  
+    def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
+        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
+        if warn and not force:
+            return HandleCommandResult(-errno.EBUSY, None, warn_message)
+        return HandleCommandResult(0, warn_message, None)
+
  
  class PrometheusService(CephadmService):
      TYPE = 'prometheus'
@@ -263,6 +278,12 @@ class PrometheusService(CephadmService):
              service_url
          )
  
+    def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
+        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
+        if warn and not force:
+            return HandleCommandResult(-errno.EBUSY, None, warn_message)
+        return HandleCommandResult(0, warn_message, None)
+
  
  class NodeExporterService(CephadmService):
      TYPE = 'node-exporter'
@@ -274,3 +295,9 @@ class NodeExporterService(CephadmService):
      def generate_config(self, daemon_spec: CephadmDaemonSpec) -> Tuple[Dict[str, Any], List[str]]:
          assert self.TYPE == daemon_spec.daemon_type
          return {}, []
+
+    def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
+        # since node exporter runs on each host and cannot compromise data, no extra checks required
+        names = [f'{self.TYPE}.{d_id}' for d_id in daemon_ids]
+        out = f'It is presumed safe to stop {names}'
+        return HandleCommandResult(0, out, None)
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py

index ee62093bf2886fdfc8f711d58bcc8015c05893ad..840e82d0167e51bb58168204014d5a4464eb314c 100644 (file)
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -776,7 +776,7 @@ class TestCephadm(object):
              match_glob(out, "Scheduled mds.fsname update...")
              CephadmServe(cephadm_module)._apply_all_services()
  
-            ok_to_stop.assert_called_with([daemon[4:]])
+            ok_to_stop.assert_called_with([daemon[4:]], force=True)
  
              assert_rm_daemon(cephadm_module, spec.service_name(), 'host1')  # verifies ok-to-stop
              assert_rm_daemon(cephadm_module, spec.service_name(), 'host2')
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py

index f9d1ba11ec98c659b0b19a090fb9c2078f77cb6d..4e21a99547a13310ff42380674269f701b9886f2 100644 (file)
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Optional, Dict
  import orchestrator
  from cephadm.serve import CephadmServe
  from cephadm.utils import name_to_config_section, CEPH_UPGRADE_ORDER
-from orchestrator import OrchestratorError, DaemonDescription
+from orchestrator import OrchestratorError, DaemonDescription, daemon_type_to_service, service_to_daemon_types
  
  if TYPE_CHECKING:
      from .module import CephadmOrchestrator
@@ -179,7 +179,9 @@ class CephadmUpgrade:
              if not self.upgrade_state or self.upgrade_state.paused:
                  return False
  
-            r = self.mgr.cephadm_services[s.daemon_type].ok_to_stop([s.daemon_id])
+            # setting force flag to retain old functionality.
+            r = self.mgr.cephadm_services[daemon_type_to_service(s.daemon_type)].ok_to_stop([
+                s.daemon_id], force=True)
  
              if not r.retval:
                  logger.info(f'Upgrade: {r.stdout}')
diff --git a/src/pybind/mgr/orchestrator/__init__.py b/src/pybind/mgr/orchestrator/__init__.py

index 2d6cba8a15626bdf0cad9e33cf01ab8b6203f4e0..4bcb26cabfa3a2e1b33e2536083300f99941f840 100644 (file)
--- a/src/pybind/mgr/orchestrator/__init__.py
+++ b/src/pybind/mgr/orchestrator/__init__.py
@@ -16,4 +16,4 @@ from ._interface import \
      DaemonDescription, \
      OrchestratorEvent, set_exception_subject, \
      InventoryHost, DeviceLightLoc, \
-    UpgradeStatusSpec
+    UpgradeStatusSpec, daemon_type_to_service, service_to_daemon_types
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py

index cc726d52d7da22449faf792011593991b10ab123..6034e219daf0b8887e8f715a9164c8b5a0e1d600 100644 (file)
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -839,7 +839,7 @@ class Orchestrator(object):
          """
          raise NotImplementedError()
  
-    def enter_host_maintenance(self, hostname: str) -> Completion:
+    def enter_host_maintenance(self, hostname: str, force: bool = False) -> Completion:
          """
          Place a host in maintenance, stopping daemons and disabling it's systemd target
          """
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py

index c0ed80efd826cb1fa0c0a8c1dde6c55353620df9..3859d11c155d205dbbb9775a593c713e3d04cfbb 100644 (file)
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -394,11 +394,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
  
      @_cli_write_command(
          'orch host maintenance enter')
-    def _host_maintenance_enter(self, hostname: str) -> HandleCommandResult:
+    def _host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult:
          """
          Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
          """
-        completion = self.enter_host_maintenance(hostname)
+        completion = self.enter_host_maintenance(hostname, force=force)
          self._orchestrator_wait([completion])
          raise_if_exception(completion)
author	Adam King <adking@redhat.com>
	Mon, 11 Jan 2021 18:23:04 +0000 (13:23 -0500)
committer	Sebastian Wagner <sebastian.wagner@suse.com>
	Fri, 29 Jan 2021 12:42:38 +0000 (13:42 +0100)
doc/mgr/orchestrator.rst		patch \| blob \| history
src/pybind/mgr/cephadm/module.py		patch \| blob \| history
src/pybind/mgr/cephadm/serve.py		patch \| blob \| history
src/pybind/mgr/cephadm/services/cephadmservice.py		patch \| blob \| history
src/pybind/mgr/cephadm/services/monitoring.py		patch \| blob \| history
src/pybind/mgr/cephadm/tests/test_cephadm.py		patch \| blob \| history
src/pybind/mgr/cephadm/upgrade.py		patch \| blob \| history
src/pybind/mgr/orchestrator/__init__.py		patch \| blob \| history
src/pybind/mgr/orchestrator/_interface.py		patch \| blob \| history
src/pybind/mgr/orchestrator/module.py		patch \| blob \| history