Right now, we have safety checks that will either say things are okay to stop,
return warnings, or return "alerts". Warnings can be bypassed already with
the --force flag that exists for the command. However, the alerts cannot be
bypassed and cephadm will refuse to attempt to put the host in maintenance mode.
The idea here is for an additional flag that also bypasses the alerts, in cases
where a user really needs to put the host into maintenance mode even if that
means causing issues within the cluster.
Fixes: https://tracker.ceph.com/issues/58892
Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit
0a980666f645387f1eaa5f2c995db304c42e85e6)
.. prompt:: bash #
- ceph orch host maintenance enter <hostname> [--force]
+ ceph orch host maintenance enter <hostname> [--force] [--yes-i-really-mean-it]
ceph orch host maintenance exit <hostname>
-Where the force flag when entering maintenance allows the user to bypass warnings (but not alerts)
+The ``--force`` flag allows the user to bypass warnings (but not alerts). The ``--yes-i-really-mean-it``
+flag bypasses all safety checks and will attempt to force the host into maintenance mode no
+matter what.
+
+.. warning:: Using the ``--yes-i-really-mean-it`` flag to force the host to enter maintenance
+ mode can potentially cause loss of data availability, the mon quorum to break down due
+ to too few running monitors, mgr module commands (such as ``ceph orch . . .`` commands)
+ to become unresponsive, and a number of other possible issues. Please only use this
+ flag if you're absolutely certain you know what you're doing.
See also :ref:`cephadm-fqdn`
@handle_orch_error
@host_exists()
- def enter_host_maintenance(self, hostname: str, force: bool = False) -> str:
+ def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> str:
""" Attempt to place a cluster host in maintenance
Placing a host into maintenance disables the cluster's ceph target in systemd
:raises OrchestratorError: Hostname is invalid, host is already in maintenance
"""
- if len(self.cache.get_hosts()) == 1:
+ if yes_i_really_mean_it and not force:
+ raise OrchestratorError("--force must be passed with --yes-i-really-mean-it")
+
+ if len(self.cache.get_hosts()) == 1 and not yes_i_really_mean_it:
raise OrchestratorError("Maintenance feature is not supported on single node clusters")
# if upgrade is active, deny
- if self.upgrade.upgrade_state:
+ if self.upgrade.upgrade_state and not yes_i_really_mean_it:
raise OrchestratorError(
f"Unable to place {hostname} in maintenance with upgrade active/paused")
# daemons on this host, so check the daemons can be stopped
# and if so, place the host into maintenance by disabling the target
rc, msg = self._host_ok_to_stop(hostname, force)
- if rc:
+ if rc and not yes_i_really_mean_it:
raise OrchestratorError(
msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc)
["enter"],
error_ok=True))
returned_msg = _err[0].split('\n')[-1]
- if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
+ if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')) and not yes_i_really_mean_it:
raise OrchestratorError(
f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}")
'who': [crush_node],
'format': 'json'
})
- if rc:
+ if rc and not yes_i_really_mean_it:
self.log.warning(
f"maintenance mode request for {hostname} failed to SET the noout group (rc={rc})")
raise OrchestratorError(
f"Unable to set the osds on {hostname} to noout (rc={rc})")
- else:
+ elif not rc:
self.log.info(
f"maintenance mode request for {hostname} has SET the noout group")
assert not cephadm_module.inventory._inventory[hostname]['status']
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+ @mock.patch("cephadm.CephadmOrchestrator._host_ok_to_stop")
+ @mock.patch("cephadm.module.HostCache.get_daemon_types")
+ @mock.patch("cephadm.module.HostCache.get_hosts")
+ def test_maintenance_enter_i_really_mean_it(self, _hosts, _get_daemon_types, _host_ok, _run_cephadm, cephadm_module: CephadmOrchestrator):
+ hostname = 'host1'
+ err_str = 'some kind of error'
+ _run_cephadm.side_effect = async_side_effect(
+ ([''], ['something\nfailed - disable the target'], 0))
+ _host_ok.return_value = 1, err_str
+ _get_daemon_types.return_value = ['mon']
+ _hosts.return_value = [hostname, 'other_host']
+ cephadm_module.inventory.add_host(HostSpec(hostname))
+
+ with pytest.raises(OrchestratorError, match=err_str):
+ cephadm_module.enter_host_maintenance(hostname)
+ assert not cephadm_module.inventory._inventory[hostname]['status']
+
+ with pytest.raises(OrchestratorError, match=err_str):
+ cephadm_module.enter_host_maintenance(hostname, force=True)
+ assert not cephadm_module.inventory._inventory[hostname]['status']
+
+ retval = cephadm_module.enter_host_maintenance(hostname, force=True, yes_i_really_mean_it=True)
+ assert retval.result_str().startswith('Daemons for Ceph cluster')
+ assert not retval.exception_str
+ assert cephadm_module.inventory._inventory[hostname]['status'] == 'maintenance'
+
@mock.patch("cephadm.serve.CephadmServe._run_cephadm")
@mock.patch("cephadm.module.HostCache.get_daemon_types")
@mock.patch("cephadm.module.HostCache.get_hosts")
"""
raise NotImplementedError()
- def enter_host_maintenance(self, hostname: str, force: bool = False) -> OrchResult:
+ def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult:
"""
Place a host in maintenance, stopping daemons and disabling it's systemd target
"""
return HandleCommandResult(stdout=completion.result_str())
@_cli_write_command('orch host maintenance enter')
- def _host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult:
+ def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
"""
Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
"""
- completion = self.enter_host_maintenance(hostname, force=force)
+ completion = self.enter_host_maintenance(hostname, force=force, yes_i_really_mean_it=yes_i_really_mean_it)
raise_if_exception(completion)
return HandleCommandResult(stdout=completion.result_str())