mgr/cephadm: add more aggressive force flag for host maintenance enter

author Adam King <adking@redhat.com>

Wed, 1 Mar 2023 15:58:43 +0000 (10:58 -0500)

committer Adam King <adking@redhat.com>

Tue, 7 Mar 2023 19:05:35 +0000 (14:05 -0500)
author Adam King <adking@redhat.com>
Wed, 1 Mar 2023 15:58:43 +0000 (10:58 -0500)
committer Adam King <adking@redhat.com>
Tue, 7 Mar 2023 19:05:35 +0000 (14:05 -0500)
diff --git a/doc/cephadm/host-management.rst b/doc/cephadm/host-management.rst

index 08fdea281d045bda431c5103ef2b7d81c8589ebd..189e16abec2b29bf69a5bc5598265fa22f4b4305 100644 (file)
--- a/doc/cephadm/host-management.rst
+++ b/doc/cephadm/host-management.rst
@@ -206,10 +206,18 @@ Place a host in and out of maintenance mode (stops all Ceph daemons on host):
  
  .. prompt:: bash #
  
-   ceph orch host maintenance enter <hostname> [--force]
+   ceph orch host maintenance enter <hostname> [--force] [--yes-i-really-mean-it]
     ceph orch host maintenance exit <hostname>
  
-Where the force flag when entering maintenance allows the user to bypass warnings (but not alerts)
+The ``--force`` flag allows the user to bypass warnings (but not alerts). The ``--yes-i-really-mean-it``
+flag bypasses all safety checks and will attempt to force the host into maintenance mode no
+matter what.
+
+.. warning:: Using the ``--yes-i-really-mean-it`` flag to force the host to enter maintenance
+   mode can potentially cause loss of data availability, the mon quorum to break down due
+   to too few running monitors, mgr module commands (such as ``ceph orch . . .`` commands)
+   to become unresponsive, and a number of other possible issues. Please only use this
+   flag if you're absolutely certain you know what you're doing.
  
  See also :ref:`cephadm-fqdn`
  
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 4d90bfcaf051cad8333a2763879b595f62285edc..158eb2926b5f3a625a5f279fff113cdf41aba2f2 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1752,7 +1752,7 @@ Then run the following:
  
      @handle_orch_error
      @host_exists()
-    def enter_host_maintenance(self, hostname: str, force: bool = False) -> str:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> str:
          """ Attempt to place a cluster host in maintenance
  
          Placing a host into maintenance disables the cluster's ceph target in systemd
@@ -1764,11 +1764,14 @@ Then run the following:
  
          :raises OrchestratorError: Hostname is invalid, host is already in maintenance
          """
-        if len(self.cache.get_hosts()) == 1:
+        if yes_i_really_mean_it and not force:
+            raise OrchestratorError("--force must be passed with --yes-i-really-mean-it")
+
+        if len(self.cache.get_hosts()) == 1 and not yes_i_really_mean_it:
              raise OrchestratorError("Maintenance feature is not supported on single node clusters")
  
          # if upgrade is active, deny
-        if self.upgrade.upgrade_state:
+        if self.upgrade.upgrade_state and not yes_i_really_mean_it:
              raise OrchestratorError(
                  f"Unable to place {hostname} in maintenance with upgrade active/paused")
  
@@ -1782,7 +1785,7 @@ Then run the following:
              # daemons on this host, so check the daemons can be stopped
              # and if so, place the host into maintenance by disabling the target
              rc, msg = self._host_ok_to_stop(hostname, force)
-            if rc:
+            if rc and not yes_i_really_mean_it:
                  raise OrchestratorError(
                      msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc)
  
@@ -1791,7 +1794,7 @@ Then run the following:
                                                                                  ["enter"],
                                                                                  error_ok=True))
              returned_msg = _err[0].split('\n')[-1]
-            if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
+            if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')) and not yes_i_really_mean_it:
                  raise OrchestratorError(
                      f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}")
  
@@ -1803,12 +1806,12 @@ Then run the following:
                      'who': [crush_node],
                      'format': 'json'
                  })
-                if rc:
+                if rc and not yes_i_really_mean_it:
                      self.log.warning(
                          f"maintenance mode request for {hostname} failed to SET the noout group (rc={rc})")
                      raise OrchestratorError(
                          f"Unable to set the osds on {hostname} to noout (rc={rc})")
-                else:
+                elif not rc:
                      self.log.info(
                          f"maintenance mode request for {hostname} has SET the noout group")
  
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py

index f3132c1275043459728ad1b58aa8f10e17d71cf5..80f2c3443dffd183915999eb419e2a6f4a1f5e92 100644 (file)
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -1716,6 +1716,33 @@ class TestCephadm(object):
  
          assert not cephadm_module.inventory._inventory[hostname]['status']
  
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    @mock.patch("cephadm.CephadmOrchestrator._host_ok_to_stop")
+    @mock.patch("cephadm.module.HostCache.get_daemon_types")
+    @mock.patch("cephadm.module.HostCache.get_hosts")
+    def test_maintenance_enter_i_really_mean_it(self, _hosts, _get_daemon_types, _host_ok, _run_cephadm, cephadm_module: CephadmOrchestrator):
+        hostname = 'host1'
+        err_str = 'some kind of error'
+        _run_cephadm.side_effect = async_side_effect(
+            ([''], ['something\nfailed - disable the target'], 0))
+        _host_ok.return_value = 1, err_str
+        _get_daemon_types.return_value = ['mon']
+        _hosts.return_value = [hostname, 'other_host']
+        cephadm_module.inventory.add_host(HostSpec(hostname))
+
+        with pytest.raises(OrchestratorError, match=err_str):
+            cephadm_module.enter_host_maintenance(hostname)
+        assert not cephadm_module.inventory._inventory[hostname]['status']
+
+        with pytest.raises(OrchestratorError, match=err_str):
+            cephadm_module.enter_host_maintenance(hostname, force=True)
+        assert not cephadm_module.inventory._inventory[hostname]['status']
+
+        retval = cephadm_module.enter_host_maintenance(hostname, force=True, yes_i_really_mean_it=True)
+        assert retval.result_str().startswith('Daemons for Ceph cluster')
+        assert not retval.exception_str
+        assert cephadm_module.inventory._inventory[hostname]['status'] == 'maintenance'
+
      @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
      @mock.patch("cephadm.module.HostCache.get_daemon_types")
      @mock.patch("cephadm.module.HostCache.get_hosts")
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py

index d08e76d8d791aed932df36fbe3e4fb4755c34c76..18854d098359ad3865a3ce8ab9d8a20e6d9ca5b0 100644 (file)
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -406,7 +406,7 @@ class Orchestrator(object):
          """
          raise NotImplementedError()
  
-    def enter_host_maintenance(self, hostname: str, force: bool = False) -> OrchResult:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult:
          """
          Place a host in maintenance, stopping daemons and disabling it's systemd target
          """
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py

index 607cd4fad79c601e45f4a894d30fbfaaa66df99f..4c9ef92114352dcfede11b6992f7c5bfa7ac304e 100644 (file)
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -566,11 +566,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
          return HandleCommandResult(stdout=completion.result_str())
  
      @_cli_write_command('orch host maintenance enter')
-    def _host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult:
+    def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
          """
          Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
          """
-        completion = self.enter_host_maintenance(hostname, force=force)
+        completion = self.enter_host_maintenance(hostname, force=force, yes_i_really_mean_it=yes_i_really_mean_it)
          raise_if_exception(completion)
  
          return HandleCommandResult(stdout=completion.result_str())
author	Adam King <adking@redhat.com>
	Wed, 1 Mar 2023 15:58:43 +0000 (10:58 -0500)
committer	Adam King <adking@redhat.com>
	Tue, 7 Mar 2023 19:05:35 +0000 (14:05 -0500)
doc/cephadm/host-management.rst		patch \| blob \| history
src/pybind/mgr/cephadm/module.py		patch \| blob \| history
src/pybind/mgr/cephadm/tests/test_cephadm.py		patch \| blob \| history
src/pybind/mgr/orchestrator/_interface.py		patch \| blob \| history
src/pybind/mgr/orchestrator/module.py		patch \| blob \| history