]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: add more aggressive force flag for host maintenance enter 50328/head
author Adam King <adking@redhat.com>
Wed, 1 Mar 2023 15:58:43 +0000 (10:58 -0500)
committer Adam King <adking@redhat.com>
Tue, 7 Mar 2023 19:05:35 +0000 (14:05 -0500)
Right now, we have safety checks that will either say things are okay to stop,
return warnings, or return "alerts". Warnings can be bypassed already with
the --force flag that exists for the command. However, the alerts cannot be
bypassed and cephadm will refuse to attempt to put the host in maintenance mode.
The idea here is for an additional flag that also bypasses the alerts, in cases
where a user really needs to put the host into maintenance mode even if that
means causing issues within the cluster.

Fixes: https://tracker.ceph.com/issues/58892
Signed-off-by: Adam King <adking@redhat.com>
doc/cephadm/host-management.rst
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py

index 08fdea281d045bda431c5103ef2b7d81c8589ebd..189e16abec2b29bf69a5bc5598265fa22f4b4305 100644 (file)
@@ -206,10 +206,18 @@ Place a host in and out of maintenance mode (stops all Ceph daemons on host):
 
 .. prompt:: bash #
 
-   ceph orch host maintenance enter <hostname> [--force]
+   ceph orch host maintenance enter <hostname> [--force] [--yes-i-really-mean-it]
    ceph orch host maintenance exit <hostname>
 
-Where the force flag when entering maintenance allows the user to bypass warnings (but not alerts)
+The ``--force`` flag allows the user to bypass warnings (but not alerts). The ``--yes-i-really-mean-it``
+flag bypasses all safety checks and will attempt to force the host into maintenance mode no
+matter what.
+
+.. warning:: Using the --yes-i-really-mean-it flag to force the host to enter maintenance
+   mode can potentially cause loss of data availability, the mon quorum to break down due
+   to too few running monitors, mgr module commands (such as ``ceph orch . . .`` commands)
+   to become unresponsive, and a number of other possible issues. Please only use this
+   flag if you're absolutely certain you know what you're doing.
 
 See also :ref:`cephadm-fqdn`
 
index 4d90bfcaf051cad8333a2763879b595f62285edc..158eb2926b5f3a625a5f279fff113cdf41aba2f2 100644 (file)
@@ -1752,7 +1752,7 @@ Then run the following:
 
     @handle_orch_error
     @host_exists()
-    def enter_host_maintenance(self, hostname: str, force: bool = False) -> str:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> str:
         """ Attempt to place a cluster host in maintenance
 
         Placing a host into maintenance disables the cluster's ceph target in systemd
@@ -1764,11 +1764,14 @@ Then run the following:
 
         :raises OrchestratorError: Hostname is invalid, host is already in maintenance
         """
-        if len(self.cache.get_hosts()) == 1:
+        if yes_i_really_mean_it and not force:
+            raise OrchestratorError("--force must be passed with --yes-i-really-mean-it")
+
+        if len(self.cache.get_hosts()) == 1 and not yes_i_really_mean_it:
             raise OrchestratorError("Maintenance feature is not supported on single node clusters")
 
         # if upgrade is active, deny
-        if self.upgrade.upgrade_state:
+        if self.upgrade.upgrade_state and not yes_i_really_mean_it:
             raise OrchestratorError(
                 f"Unable to place {hostname} in maintenance with upgrade active/paused")
 
@@ -1782,7 +1785,7 @@ Then run the following:
             # daemons on this host, so check the daemons can be stopped
             # and if so, place the host into maintenance by disabling the target
             rc, msg = self._host_ok_to_stop(hostname, force)
-            if rc:
+            if rc and not yes_i_really_mean_it:
                 raise OrchestratorError(
                     msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc)
 
@@ -1791,7 +1794,7 @@ Then run the following:
                                                                                 ["enter"],
                                                                                 error_ok=True))
             returned_msg = _err[0].split('\n')[-1]
-            if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
+            if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')) and not yes_i_really_mean_it:
                 raise OrchestratorError(
                     f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}")
 
@@ -1803,12 +1806,12 @@ Then run the following:
                     'who': [crush_node],
                     'format': 'json'
                 })
-                if rc:
+                if rc and not yes_i_really_mean_it:
                     self.log.warning(
                         f"maintenance mode request for {hostname} failed to SET the noout group (rc={rc})")
                     raise OrchestratorError(
                         f"Unable to set the osds on {hostname} to noout (rc={rc})")
-                else:
+                elif not rc:
                     self.log.info(
                         f"maintenance mode request for {hostname} has SET the noout group")
 
index f3132c1275043459728ad1b58aa8f10e17d71cf5..80f2c3443dffd183915999eb419e2a6f4a1f5e92 100644 (file)
@@ -1716,6 +1716,33 @@ class TestCephadm(object):
 
         assert not cephadm_module.inventory._inventory[hostname]['status']
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    @mock.patch("cephadm.CephadmOrchestrator._host_ok_to_stop")
+    @mock.patch("cephadm.module.HostCache.get_daemon_types")
+    @mock.patch("cephadm.module.HostCache.get_hosts")
+    def test_maintenance_enter_i_really_mean_it(self, _hosts, _get_daemon_types, _host_ok, _run_cephadm, cephadm_module: CephadmOrchestrator):
+        hostname = 'host1'
+        err_str = 'some kind of error'
+        _run_cephadm.side_effect = async_side_effect(
+            ([''], ['something\nfailed - disable the target'], 0))
+        _host_ok.return_value = 1, err_str
+        _get_daemon_types.return_value = ['mon']
+        _hosts.return_value = [hostname, 'other_host']
+        cephadm_module.inventory.add_host(HostSpec(hostname))
+
+        with pytest.raises(OrchestratorError, match=err_str):
+            cephadm_module.enter_host_maintenance(hostname)
+        assert not cephadm_module.inventory._inventory[hostname]['status']
+
+        with pytest.raises(OrchestratorError, match=err_str):
+            cephadm_module.enter_host_maintenance(hostname, force=True)
+        assert not cephadm_module.inventory._inventory[hostname]['status']
+
+        retval = cephadm_module.enter_host_maintenance(hostname, force=True, yes_i_really_mean_it=True)
+        assert retval.result_str().startswith('Daemons for Ceph cluster')
+        assert not retval.exception_str
+        assert cephadm_module.inventory._inventory[hostname]['status'] == 'maintenance'
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
     @mock.patch("cephadm.module.HostCache.get_daemon_types")
     @mock.patch("cephadm.module.HostCache.get_hosts")
index d08e76d8d791aed932df36fbe3e4fb4755c34c76..18854d098359ad3865a3ce8ab9d8a20e6d9ca5b0 100644 (file)
@@ -406,7 +406,7 @@ class Orchestrator(object):
         """
         raise NotImplementedError()
 
-    def enter_host_maintenance(self, hostname: str, force: bool = False) -> OrchResult:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult:
         """
         Place a host in maintenance, stopping daemons and disabling it's systemd target
         """
index 607cd4fad79c601e45f4a894d30fbfaaa66df99f..4c9ef92114352dcfede11b6992f7c5bfa7ac304e 100644 (file)
@@ -566,11 +566,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
         return HandleCommandResult(stdout=completion.result_str())
 
     @_cli_write_command('orch host maintenance enter')
-    def _host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult:
+    def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
         """
         Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
         """
-        completion = self.enter_host_maintenance(hostname, force=force)
+        completion = self.enter_host_maintenance(hostname, force=force, yes_i_really_mean_it=yes_i_really_mean_it)
         raise_if_exception(completion)
 
         return HandleCommandResult(stdout=completion.result_str())