@handle_orch_error
@host_exists()
- def exit_host_maintenance(self, hostname: str) -> str:
+ def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> str:
"""Exit maintenance mode and return a host to an operational state
Returning from maintenance will enable the cluster's systemd target and,
if the host has osd daemons, the noout flag will be unset.
:param hostname: (str) host name
+ :param force: (bool) force the host out of maintenance mode even if steps fail
+ :param offline: (bool) allow an offline host to be taken out of maintenance mode
:raises OrchestratorError: Unable to return from maintenance, or unset the
noout flag
if tgt_host['status'] != "maintenance":
raise OrchestratorError(f"Host {hostname} is not in maintenance mode")
- with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
- outs, errs, _code = self.wait_async(
- CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
- 'host-maintenance', ['exit'], error_ok=True))
- returned_msg = errs[0].split('\n')[-1]
- if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
- raise OrchestratorError(
- f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
-
- if "osd" in self.cache.get_daemon_types(hostname):
- crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
- rc, _out, _err = self.mon_command({
- 'prefix': 'osd unset-group',
- 'flags': 'noout',
- 'who': [crush_node],
- 'format': 'json'
- })
- if rc:
- self.log.warning(
- f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
- raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
- else:
- self.log.info(
- f"exit maintenance request has UNSET for the noout group on host {hostname}")
+ # Given we do not regularly check maintenance mode hosts for being offline,
+ # we have no idea at this point whether the host is online or not.
+ # Keep in mind this goes both ways, as users could have run
+ # "ceph cephadm check-host <hostname>" when the host was in maintenance
+ # mode and offline and the host could have since come online. The following
+ # "cephadm check-host" command is run purely so we know whether the host
+ # is online or offline, as those cases need to be handled differently
+ try:
+ with self.async_timeout_handler(hostname, 'cephadm check-host'):
+ outs, errs, _code = self.wait_async(
+ CephadmServe(self)._run_cephadm(
+ hostname, cephadmNoImage,
+ 'check-host', [], error_ok=False
+ )
+ )
+ except OrchestratorError:
+ pass
+
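+ # the check-host attempt above refreshes self.offline_hosts for unreachable
+ # hosts, so this membership test reflects the host's current reachability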
+ host_offline = hostname in self.offline_hosts
+
+ if host_offline and not offline:
+ raise OrchestratorValidationError(
+ f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode')
+
+ if not host_offline and offline:
+ raise OrchestratorValidationError(
+ f'{hostname} is online, please take the host out of maintenance mode without --offline.')
+
+ if offline and not force:
+ raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force")
+
+ # no point trying these parts if we know the host is offline
+ if not host_offline:
+ with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
+ outs, errs, _code = self.wait_async(
+ CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
+ 'host-maintenance', ['exit'], error_ok=True))
+ returned_msg = errs[0].split('\n')[-1]
+ if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')):
+ self.log.warning(
+ f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+ if not force:
+ raise OrchestratorError(
+ f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+
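+ # entering maintenance sets the noout flag for this host's OSDs; now that the
+ # host is out of maintenance, clear it again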
+ if "osd" in self.cache.get_daemon_types(hostname):
+ crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
+ rc, _out, _err = self.mon_command({
+ 'prefix': 'osd unset-group',
+ 'flags': 'noout',
+ 'who': [crush_node],
+ 'format': 'json'
+ })
+ if rc:
+ self.log.warning(
+ f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
+ if not force:
+ raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
+ else:
+ self.log.info(
+ f"exit maintenance request has UNSET for the noout group on host {hostname}")
# update the host record status
tgt_host['status'] = ""