mgr/cephadm: add offline host handling to maintenance exit

author Adam King <adking@redhat.com>

Wed, 2 Oct 2024 20:04:22 +0000 (16:04 -0400)

committer Adam King <adking@redhat.com>

Wed, 16 Oct 2024 17:52:27 +0000 (13:52 -0400)
author Adam King <adking@redhat.com>
Wed, 2 Oct 2024 20:04:22 +0000 (16:04 -0400)
committer Adam King <adking@redhat.com>
Wed, 16 Oct 2024 17:52:27 +0000 (13:52 -0400)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 565067f32afc1d910b252063deee3eb9fba1836b..56bebf1387c8fd73c45ba5a7249cc1ca6a5b0bb5 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -2272,7 +2272,7 @@ Then run the following:
  
      @handle_orch_error
      @host_exists()
-    def exit_host_maintenance(self, hostname: str) -> str:
+    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> str:
          """Exit maintenance mode and return a host to an operational state
  
          Returning from maintenance will enable the clusters systemd target and
@@ -2280,6 +2280,8 @@ Then run the following:
          host has osd daemons
  
          :param hostname: (str) host name
+        :param force: (bool) force removal of the host from maintenance mode
+        :param offline: (bool) to remove hosts that are offline from maintenance mode
  
          :raises OrchestratorError: Unable to return from maintenance, or unset the
                                     noout flag
@@ -2288,30 +2290,67 @@ Then run the following:
          if tgt_host['status'] != "maintenance":
              raise OrchestratorError(f"Host {hostname} is not in maintenance mode")
  
-        with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
-            outs, errs, _code = self.wait_async(
-                CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
-                                                'host-maintenance', ['exit'], error_ok=True))
-        returned_msg = errs[0].split('\n')[-1]
-        if returned_msg.startswith('failed') or returned_msg.startswith('ERROR'):
-            raise OrchestratorError(
-                f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
-
-        if "osd" in self.cache.get_daemon_types(hostname):
-            crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
-            rc, _out, _err = self.mon_command({
-                'prefix': 'osd unset-group',
-                'flags': 'noout',
-                'who': [crush_node],
-                'format': 'json'
-            })
-            if rc:
+        # Given we do not regularly check maintenance mode hosts for being offline,
+        # we have no idea at this point whether the host is online or not.
+        # Keep in mind this goes both ways, as users could have run
+        # "ceph cephadm check-host <hostname>" when the host was in maintenance
+        # mode and offline and the host could have since come online. This following
+        # "cephadm check-host" command is being run purely so we know if the host
+        # is online or offline, as those should be handled differently
+        try:
+            with self.async_timeout_handler(hostname, 'cephadm check-host'):
+                outs, errs, _code = self.wait_async(
+                    CephadmServe(self)._run_cephadm(
+                        hostname, cephadmNoImage,
+                        'check-host', [], error_ok=False
+                    )
+                )
+        except OrchestratorError:
+            pass
+
+        host_offline = hostname in self.offline_hosts
+
+        if host_offline and not offline:
+            raise OrchestratorValidationError(
+                f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode')
+
+        if not host_offline and offline:
+            raise OrchestratorValidationError(
+                f'{hostname} is online, please take host out of maintenance mode without --offline.')
+
+        if offline and not force:
+            raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force")
+
+        # no point trying these parts if we know the host is offline
+        if not host_offline:
+            with self.async_timeout_handler(hostname, 'cephadm host-maintenance exit'):
+                outs, errs, _code = self.wait_async(
+                    CephadmServe(self)._run_cephadm(hostname, cephadmNoImage,
+                                                    'host-maintenance', ['exit'], error_ok=True))
+            returned_msg = errs[0].split('\n')[-1]
+            if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')):
                  self.log.warning(
-                    f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
-                raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
-            else:
-                self.log.info(
-                    f"exit maintenance request has UNSET for the noout group on host {hostname}")
+                    f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+                if not force:
+                    raise OrchestratorError(
+                        f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+
+            if "osd" in self.cache.get_daemon_types(hostname):
+                crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
+                rc, _out, _err = self.mon_command({
+                    'prefix': 'osd unset-group',
+                    'flags': 'noout',
+                    'who': [crush_node],
+                    'format': 'json'
+                })
+                if rc:
+                    self.log.warning(
+                        f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
+                    if not force:
+                        raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
+                else:
+                    self.log.info(
+                        f"exit maintenance request has UNSET for the noout group on host {hostname}")
  
          # update the host record status
          tgt_host['status'] = ""
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py

index acf3b0076d01dbb10f9acb78e98bba8efb5b8264..80e056f3cc905c92d1ac1b4a4f7a1162e1444e93 100644 (file)
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -503,7 +503,7 @@ class Orchestrator(object):
          """
          raise NotImplementedError()
  
-    def exit_host_maintenance(self, hostname: str) -> OrchResult:
+    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult:
          """
          Return a host from maintenance, restarting the clusters systemd target
          """
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py

index 65460f73ee1c83650400db73cd93e83f8e753399..023ec3b0b6a4fe685bf990983198d62bac904915 100644 (file)
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -799,11 +799,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
          return HandleCommandResult(stdout=completion.result_str())
  
      @_cli_write_command('orch host maintenance exit')
-    def _host_maintenance_exit(self, hostname: str) -> HandleCommandResult:
+    def _host_maintenance_exit(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult:
          """
          Return a host from maintenance, restarting all Ceph daemons (cephadm only)
          """
-        completion = self.exit_host_maintenance(hostname)
+        completion = self.exit_host_maintenance(hostname, force, offline)
          raise_if_exception(completion)
  
          return HandleCommandResult(stdout=completion.result_str())
author	Adam King <adking@redhat.com>
	Wed, 2 Oct 2024 20:04:22 +0000 (16:04 -0400)
committer	Adam King <adking@redhat.com>
	Wed, 16 Oct 2024 17:52:27 +0000 (13:52 -0400)
src/pybind/mgr/cephadm/module.py		patch \| blob \| history
src/pybind/mgr/orchestrator/_interface.py		patch \| blob \| history
src/pybind/mgr/orchestrator/module.py		patch \| blob \| history