mgr/cephadm: add more aggressive force flag for host maintenance enter 50901/head
author     Adam King <adking@redhat.com>
           Wed, 1 Mar 2023 15:58:43 +0000 (10:58 -0500)
committer  Adam King <adking@redhat.com>
           Wed, 5 Apr 2023 19:37:19 +0000 (15:37 -0400)
Right now, we have safety checks that will either say things are okay to stop,
return warnings, or return "alerts". Warnings can already be bypassed with the
--force flag that exists for the command. However, the alerts cannot be
bypassed, and cephadm will refuse to attempt to put the host into maintenance
mode. The idea here is an additional flag that also bypasses those alerts, for
cases where a user really needs to put the host into maintenance mode even if
that means causing issues within the cluster.

Fixes: https://tracker.ceph.com/issues/58892
Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 0a980666f645387f1eaa5f2c995db304c42e85e6)
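
In short (an illustrative sketch only, not code from this commit; may_bypass() and its arguments are hypothetical), the intended relationship between the two flags and the three possible safety-check outcomes is:

    # Illustrative sketch only -- not code from this commit; may_bypass() and its
    # arguments are hypothetical. The real checks live in
    # CephadmOrchestrator.enter_host_maintenance() in the module.py diff below.
    def may_bypass(check_result: str, force: bool, yes_i_really_mean_it: bool) -> bool:
        """Return True if a failed safety check may be skipped."""
        if check_result == "ok":
            return True                     # nothing failed, nothing to bypass
        if check_result == "warning":
            return force                    # --force already covered warnings
        if check_result == "alert":
            return yes_i_really_mean_it     # new: only the stronger flag bypasses alerts
        raise ValueError(f"unknown check result: {check_result!r}")

Note that, as enforced in the module.py change below, --yes-i-really-mean-it is only accepted together with --force.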

doc/cephadm/host-management.rst
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py

index b2c514c8ca704eee3f3aa73fb34d7ac04493f4f9..c3e52f008ccd45ff4bf2db05c1ca6818484f42ee 100644 (file)
@@ -193,10 +193,18 @@ Place a host in and out of maintenance mode (stops all Ceph daemons on host):
 
 .. prompt:: bash #
 
-   ceph orch host maintenance enter <hostname> [--force]
+   ceph orch host maintenance enter <hostname> [--force] [--yes-i-really-mean-it]
    ceph orch host maintenance exit <hostname>
 
-Where the force flag when entering maintenance allows the user to bypass warnings (but not alerts)
+The ``--force`` flag allows the user to bypass warnings (but not alerts). The ``--yes-i-really-mean-it``
+flag bypasses all safety checks and will attempt to force the host into maintenance mode no
+matter what.
+
+.. warning:: Using the --yes-i-really-mean-it flag to force the host to enter maintenance
+   mode can potentially cause loss of data availability, the mon quorum to break down due
+   to too few running monitors, mgr module commands (such as ``ceph orch . . .`` commands)
+   to become unresponsive, and a number of other possible issues. Please only use this
+   flag if you're absolutely certain you know what you're doing.
 
 See also :ref:`cephadm-fqdn`
 
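Reading ahead to the implementation, entering maintenance walks through a fixed sequence of gates; the outline below (a simplification with hypothetical labels, not code from this commit) summarizes which gate each flag can bypass, as implemented in the module.py hunks that follow.

    # Simplified outline (labels are hypothetical) of the gates in
    # enter_host_maintenance() and which flag skips each of them.
    CHECKS = [
        ("single-node cluster",              "skipped with --yes-i-really-mean-it"),
        ("upgrade active/paused",            "skipped with --yes-i-really-mean-it"),
        ("daemons not ok-to-stop",           "warnings: --force; alerts: --yes-i-really-mean-it"),
        ("disabling the ceph target failed", "ignored with --yes-i-really-mean-it"),
        ("setting noout on the OSDs failed", "tolerated with --yes-i-really-mean-it"),
    ]
    for gate, bypass in CHECKS:
        print(f"{gate:34s} -> {bypass}")
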
index 6c8e7497f5540f7447ead8f6cd1c885619241177..59632fabb4eb6e266fedaa9b14fb54c32b41a64a 100644 (file)
@@ -1668,7 +1668,7 @@ Then run the following:
 
     @handle_orch_error
     @host_exists()
-    def enter_host_maintenance(self, hostname: str, force: bool = False) -> str:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> str:
         """ Attempt to place a cluster host in maintenance
 
         Placing a host into maintenance disables the cluster's ceph target in systemd
@@ -1680,11 +1680,14 @@ Then run the following:
 
         :raises OrchestratorError: Hostname is invalid, host is already in maintenance
         """
-        if len(self.cache.get_hosts()) == 1:
+        if yes_i_really_mean_it and not force:
+            raise OrchestratorError("--force must be passed with --yes-i-really-mean-it")
+
+        if len(self.cache.get_hosts()) == 1 and not yes_i_really_mean_it:
             raise OrchestratorError("Maintenance feature is not supported on single node clusters")
 
         # if upgrade is active, deny
-        if self.upgrade.upgrade_state:
+        if self.upgrade.upgrade_state and not yes_i_really_mean_it:
             raise OrchestratorError(
                 f"Unable to place {hostname} in maintenance with upgrade active/paused")
 
@@ -1698,7 +1701,7 @@ Then run the following:
             # daemons on this host, so check the daemons can be stopped
             # and if so, place the host into maintenance by disabling the target
             rc, msg = self._host_ok_to_stop(hostname, force)
-            if rc:
+            if rc and not yes_i_really_mean_it:
                 raise OrchestratorError(
                     msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc)
 
@@ -1707,7 +1710,7 @@ Then run the following:
                                                                                 ["enter"],
                                                                                 error_ok=True))
             returned_msg = _err[0].split('\n')[-1]
-            if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
+            if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')) and not yes_i_really_mean_it:
                 raise OrchestratorError(
                     f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}")
 
@@ -1719,12 +1722,12 @@ Then run the following:
                     'who': [crush_node],
                     'format': 'json'
                 })
-                if rc:
+                if rc and not yes_i_really_mean_it:
                     self.log.warning(
                         f"maintenance mode request for {hostname} failed to SET the noout group (rc={rc})")
                     raise OrchestratorError(
                         f"Unable to set the osds on {hostname} to noout (rc={rc})")
-                else:
+                elif not rc:
                     self.log.info(
                         f"maintenance mode request for {hostname} has SET the noout group")
 
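The last hunk above deserves a note: because --yes-i-really-mean-it lets execution continue past a failed attempt to set noout on the host's OSDs, the success log can no longer sit in a bare else branch. A minimal standalone sketch of the three cases (hypothetical rc values, not code from this commit):

    # Why `else:` became `elif not rc:`: with the bypass, a non-zero rc no longer
    # raises, and a bare `else` would then wrongly log that noout was SET.
    for rc, yes_i_really_mean_it in [(0, False), (1, False), (1, True)]:
        if rc and not yes_i_really_mean_it:
            print(f"rc={rc}, bypass={yes_i_really_mean_it}: raise OrchestratorError")
        elif not rc:
            print(f"rc={rc}, bypass={yes_i_really_mean_it}: log 'SET the noout group'")
        else:
            print(f"rc={rc}, bypass={yes_i_really_mean_it}: continue despite the noout failure")
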
index 33185cad6346dba46b158c2dfda7618510cdd94a..92570f852f2e6ee9e1e2521d1d672b87a67b6977 100644 (file)
@@ -1774,6 +1774,33 @@ class TestCephadm(object):
 
         assert not cephadm_module.inventory._inventory[hostname]['status']
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    @mock.patch("cephadm.CephadmOrchestrator._host_ok_to_stop")
+    @mock.patch("cephadm.module.HostCache.get_daemon_types")
+    @mock.patch("cephadm.module.HostCache.get_hosts")
+    def test_maintenance_enter_i_really_mean_it(self, _hosts, _get_daemon_types, _host_ok, _run_cephadm, cephadm_module: CephadmOrchestrator):
+        hostname = 'host1'
+        err_str = 'some kind of error'
+        _run_cephadm.side_effect = async_side_effect(
+            ([''], ['something\nfailed - disable the target'], 0))
+        _host_ok.return_value = 1, err_str
+        _get_daemon_types.return_value = ['mon']
+        _hosts.return_value = [hostname, 'other_host']
+        cephadm_module.inventory.add_host(HostSpec(hostname))
+
+        with pytest.raises(OrchestratorError, match=err_str):
+            cephadm_module.enter_host_maintenance(hostname)
+        assert not cephadm_module.inventory._inventory[hostname]['status']
+
+        with pytest.raises(OrchestratorError, match=err_str):
+            cephadm_module.enter_host_maintenance(hostname, force=True)
+        assert not cephadm_module.inventory._inventory[hostname]['status']
+
+        retval = cephadm_module.enter_host_maintenance(hostname, force=True, yes_i_really_mean_it=True)
+        assert retval.result_str().startswith('Daemons for Ceph cluster')
+        assert not retval.exception_str
+        assert cephadm_module.inventory._inventory[hostname]['status'] == 'maintenance'
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
     @mock.patch("cephadm.module.HostCache.get_daemon_types")
     @mock.patch("cephadm.module.HostCache.get_hosts")
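
One case the new test does not cover is the guard added in module.py that rejects --yes-i-really-mean-it without --force. A hypothetical companion assertion (not part of this commit), reusing the same fixtures and setup as test_maintenance_enter_i_really_mean_it above, might look like:

    # Hypothetical companion check, not part of this commit; assumes the same
    # fixtures/setup as test_maintenance_enter_i_really_mean_it above.
    with pytest.raises(OrchestratorError, match='--force must be passed with --yes-i-really-mean-it'):
        cephadm_module.enter_host_maintenance(hostname, yes_i_really_mean_it=True)
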
index 47009f4f4151fb1847fc95e78a4f0da6f73b3d5b..934c8742659c2f199bb19ffd4f38f81c95a381b8 100644 (file)
@@ -406,7 +406,7 @@ class Orchestrator(object):
         """
         raise NotImplementedError()
 
-    def enter_host_maintenance(self, hostname: str, force: bool = False) -> OrchResult:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult:
         """
         Place a host in maintenance, stopping daemons and disabling its systemd target
         """
index 73bd137484db6eb7e39c52097aee5e31bddd2238..d4828bc37fea8a9cde7f3a4427e0ade0e3205884 100644 (file)
@@ -437,11 +437,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
         return HandleCommandResult(stdout=completion.result_str())
 
     @_cli_write_command('orch host maintenance enter')
-    def _host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult:
+    def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
         """
         Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
         """
-        completion = self.enter_host_maintenance(hostname, force=force)
+        completion = self.enter_host_maintenance(hostname, force=force, yes_i_really_mean_it=yes_i_really_mean_it)
         raise_if_exception(completion)
 
         return HandleCommandResult(stdout=completion.result_str())