cephadm: Provide appropriate exit codes for orch operations

author Kushal Deb <Kushal.Deb@ibm.com>

Tue, 25 Mar 2025 04:48:05 +0000 (10:18 +0530)

committer Kushal Deb <Kushal.Deb@ibm.com>

Thu, 22 May 2025 04:47:46 +0000 (10:17 +0530)
author Kushal Deb <Kushal.Deb@ibm.com>
Tue, 25 Mar 2025 04:48:05 +0000 (10:18 +0530)
committer Kushal Deb <Kushal.Deb@ibm.com>
Thu, 22 May 2025 04:47:46 +0000 (10:17 +0530)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 1a5389d026083707205548227b8eb6123896571c..7609ae099c8cfed83e584f05ceb6dfadf6eb6cf2 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -2093,12 +2093,13 @@ Then run the following:
  
          notifications: List[str] = []
          error_notifications: List[str] = []
-        okay: bool = True
+        initial_rc = 0
          for daemon_type, daemon_ids in daemon_map.items():
              r = service_registry.get_service(daemon_type_to_service(
                  daemon_type)).ok_to_stop(daemon_ids, force=force)
              if r.retval:
-                okay = False
+                initial_rc = r.retval if initial_rc == 0 else initial_rc
+
                  # collect error notifications so user can see every daemon causing host
                  # to not be okay to stop
                  error_notifications.append(r.stderr)
@@ -2106,9 +2107,9 @@ Then run the following:
                  # if extra notifications to print for user, add them to notifications list
                  notifications.append(r.stdout)
  
-        if not okay:
+        if initial_rc:
              # at least one daemon is not okay to stop
-            return 1, '\n'.join(error_notifications)
+            return initial_rc, '\n'.join(error_notifications)
  
          if notifications:
              return 0, (f'It is presumed safe to stop host {hostname}. '
@@ -2118,7 +2119,7 @@ Then run the following:
      @handle_orch_error
      def host_ok_to_stop(self, hostname: str) -> str:
          if hostname not in self.cache.get_hosts():
-            raise OrchestratorError(f'Cannot find host "{hostname}"')
+            raise OrchestratorError(f'Cannot find host "{hostname}"', errno=errno.EINVAL)
  
          rc, msg = self._host_ok_to_stop(hostname)
          if rc:
@@ -2152,20 +2153,20 @@ Then run the following:
  
          :raises OrchestratorError: Hostname is invalid, host is already in maintenance
          """
+
          if yes_i_really_mean_it and not force:
-            raise OrchestratorError("--force must be passed with --yes-i-really-mean-it")
+            raise OrchestratorError("--force must be passed with --yes-i-really-mean-it", errno=errno.EINVAL)
  
          if len(self.cache.get_hosts()) == 1 and not yes_i_really_mean_it:
-            raise OrchestratorError("Maintenance feature is not supported on single node clusters")
+            raise OrchestratorError("Maintenance feature is not supported on single node clusters", errno=errno.ENOTSUP)
  
          # if upgrade is active, deny
          if self.upgrade.upgrade_state and not yes_i_really_mean_it:
-            raise OrchestratorError(
-                f"Unable to place {hostname} in maintenance with upgrade active/paused")
+            raise OrchestratorError(f"Unable to place {hostname} in maintenance with upgrade active/paused", errno=errno.EAGAIN)
  
          tgt_host = self.inventory._inventory[hostname]
          if tgt_host.get("status", "").lower() == "maintenance":
-            raise OrchestratorError(f"Host {hostname} is already in maintenance")
+            raise OrchestratorError(f"Host {hostname} is already in maintenance", errno=errno.EALREADY)
  
          host_daemons = self.cache.get_daemon_types(hostname)
          self.log.debug("daemons on host {}".format(','.join(host_daemons)))
@@ -2186,9 +2187,7 @@ Then run the following:
                          error_ok=True))
              returned_msg = _err[0].split('\n')[-1]
              if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')) and not yes_i_really_mean_it:
-                raise OrchestratorError(
-                    f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}")
-
+                raise OrchestratorError(f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}", errno=errno.EPERM)
              if "osd" in host_daemons:
                  crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
                  rc, out, err = self.mon_command({
@@ -2201,7 +2200,7 @@ Then run the following:
                      self.log.warning(
                          f"maintenance mode request for {hostname} failed to SET the noout group (rc={rc})")
                      raise OrchestratorError(
-                        f"Unable to set the osds on {hostname} to noout (rc={rc})")
+                        f"Unable to set the osds on {hostname} to noout (rc={rc})", errno=errno.EIO)
                  elif not rc:
                      self.log.info(
                          f"maintenance mode request for {hostname} has SET the noout group")
@@ -2232,7 +2231,7 @@ Then run the following:
          """
          tgt_host = self.inventory._inventory[hostname]
          if tgt_host['status'] != "maintenance":
-            raise OrchestratorError(f"Host {hostname} is not in maintenance mode")
+            raise OrchestratorError(f"Host {hostname} is not in maintenance mode", errno=errno.EINVAL)
  
          # Given we do not regularly check maintenance mode hosts for being offline,
          # we have no idea at this point whether the host is online or not.
@@ -2256,14 +2255,14 @@ Then run the following:
  
          if host_offline and not offline:
              raise OrchestratorValidationError(
-                f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode')
+                f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode', errno=errno.ENOTSUP)
  
          if not host_offline and offline:
              raise OrchestratorValidationError(
-                f'{hostname} is online, please take host out of maintenance mode without --offline.')
+                f'{hostname} is online, please take host out of maintenance mode without --offline.', errno=errno.EAGAIN)
  
          if offline and not force:
-            raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force")
+            raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force", errno=errno.EAGAIN)
  
          # no point trying these parts if we know the host is offline
          if not host_offline:
@@ -2277,7 +2276,7 @@ Then run the following:
                      f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
                  if not force:
                      raise OrchestratorError(
-                        f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}")
+                        f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}", errno=errno.EPERM)
  
              if "osd" in self.cache.get_daemon_types(hostname):
                  crush_node = hostname if '.' not in hostname else hostname.split('.')[0]
@@ -2291,7 +2290,7 @@ Then run the following:
                      self.log.warning(
                          f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})")
                      if not force:
-                        raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})")
+                        raise OrchestratorError(f"Unable to unset noout for the osds on {hostname} (rc={rc})", errno=errno.EIO)
                  else:
                      self.log.info(
                          f"exit maintenance request has UNSET for the noout group on host {hostname}")
@@ -2720,7 +2719,7 @@ Then run the following:
                  if name in dm:
                      args.append((name, host))
          if not args:
-            raise OrchestratorError('Unable to find daemon(s) %s' % (names))
+            raise OrchestratorError('Unable to find daemon(s) %s' % (names), errno=errno.EINVAL)
          self.log.info('Remove daemons %s' % ' '.join([a[0] for a in args]))
          return self._remove_daemons(args)
  
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py

index 0fc103344e39f0e5af7ab1fe677d7f0874455752..d2691a1a0aeb32773f375d9409a1e4467b235b9f 100644 (file)
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -74,7 +74,7 @@ class OrchestratorError(Exception):
                   errno: int = -errno.EINVAL,
                   event_kind_subject: Optional[Tuple[str, str]] = None) -> None:
          super(Exception, self).__init__(msg)
-        self.errno = errno
+        self.errno = abs(errno)
          # See OrchestratorEvent.subject
          self.event_subject = event_kind_subject
  
@@ -111,12 +111,12 @@ def handle_exception(prefix: str, perm: str, func: FuncT) -> FuncT:
              return func(*args, **kwargs)
          except (OrchestratorError, SpecValidationError) as e:
              # Do not print Traceback for expected errors.
-            return HandleCommandResult(e.errno, stderr=str(e))
+            return HandleCommandResult(retval=e.errno, stderr=str(e))
          except ImportError as e:
-            return HandleCommandResult(-errno.ENOENT, stderr=str(e))
+            return HandleCommandResult(retval=-errno.ENOENT, stderr=str(e))
          except NotImplementedError:
              msg = 'This Orchestrator does not support `{}`'.format(prefix)
-            return HandleCommandResult(-errno.ENOENT, stderr=msg)
+            return HandleCommandResult(retval=-errno.ENOENT, stderr=msg)
  
      # misuse lambda to copy `wrapper`
      wrapper_copy = lambda *l_args, **l_kwargs: wrapper(*l_args, **l_kwargs)  # noqa: E731
@@ -245,6 +245,25 @@ def raise_if_exception(c: OrchResult[T]) -> T:
      return c.result
  
  
+def completion_to_result(c: OrchResult[T]) -> HandleCommandResult:
+    """
+    Convert an OrchResult to a HandleCommandResult: stdout on success, stderr and
+    a negative errno retval on failure; non-OrchestratorError exceptions are re-raised.
+    """
+    if c.serialized_exception is None:
+        assert c.result is not None, "OrchResult should either have result or an exception"
+        return HandleCommandResult(stdout=c.result_str())
+
+    try:
+        e = pickle.loads(c.serialized_exception)
+    except (KeyError, AttributeError):
+        return HandleCommandResult(stderr=c.exception_str, retval=-errno.EIO)
+    if isinstance(e, OrchestratorError):
+        return HandleCommandResult(stderr=str(e), retval=-e.errno)
+
+    raise e
+
+
  def _hide_in_features(f: FuncT) -> FuncT:
      f._hide_in_features = True  # type: ignore
      return f
@@ -489,7 +508,7 @@ class Orchestrator(object):
          """
          raise NotImplementedError()
  
-    def host_ok_to_stop(self, hostname: str) -> OrchResult:
+    def host_ok_to_stop(self, hostname: str) -> OrchResult[str]:
          """
          Check if the specified host can be safely stopped without reducing availability
  
@@ -497,13 +516,13 @@ class Orchestrator(object):
          """
          raise NotImplementedError()
  
-    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult:
+    def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult[str]:
          """
          Place a host in maintenance, stopping daemons and disabling it's systemd target
          """
          raise NotImplementedError()
  
-    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult:
+    def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult[str]:
          """
          Return a host from maintenance, restarting the clusters systemd target
          """
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py

index 151616c84860d8c4c69d47812db4684bd18c2310..ddef2d13197c4ce8f732744dcf6c6b2843879b89 100644 (file)
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -55,6 +55,7 @@ from ._interface import (
      _cli_write_command,
      json_to_generic_spec,
      raise_if_exception,
+    completion_to_result,
  )
  
  
@@ -786,8 +787,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
      def _host_ok_to_stop(self, hostname: str) -> HandleCommandResult:
          """Check if the specified host can be safely stopped without reducing availability"""""
          completion = self.host_ok_to_stop(hostname)
-        raise_if_exception(completion)
-        return HandleCommandResult(stdout=completion.result_str())
+        return completion_to_result(completion)
  
      @_cli_write_command('orch host maintenance enter')
      def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult:
@@ -795,9 +795,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
          Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
          """
          completion = self.enter_host_maintenance(hostname, force=force, yes_i_really_mean_it=yes_i_really_mean_it)
-        raise_if_exception(completion)
-
-        return HandleCommandResult(stdout=completion.result_str())
+        return completion_to_result(completion)
  
      @_cli_write_command('orch host maintenance exit')
      def _host_maintenance_exit(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult:
@@ -805,9 +803,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
          Return a host from maintenance, restarting all Ceph daemons (cephadm only)
          """
          completion = self.exit_host_maintenance(hostname, force, offline)
-        raise_if_exception(completion)
-
-        return HandleCommandResult(stdout=completion.result_str())
+        return completion_to_result(completion)
  
      @_cli_write_command('orch host rescan')
      def _host_rescan(self, hostname: str, with_summary: bool = False) -> HandleCommandResult:
@@ -1783,14 +1779,12 @@ Usage:
          """Remove specific daemon(s)"""
          for name in names:
              if '.' not in name:
-                raise OrchestratorError('%s is not a valid daemon name' % name)
+                return HandleCommandResult(stderr=f"{name} is not a valid daemon name", retval=-errno.EINVAL)
              (daemon_type) = name.split('.')[0]
              if not force and daemon_type in ['osd', 'mon', 'prometheus']:
-                raise OrchestratorError(
-                    'must pass --force to REMOVE daemon with potentially PRECIOUS DATA for %s' % name)
+                return HandleCommandResult(stderr=f"must pass --force to REMOVE daemon with potentially PRECIOUS DATA for {name}", retval=-errno.EPERM)
          completion = self.remove_daemons(names)
-        raise_if_exception(completion)
-        return HandleCommandResult(stdout=completion.result_str())
+        return completion_to_result(completion)
  
      @_cli_write_command('orch rm')
      def _service_rm(self,
diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py

index 079e8d7f84bbd72048277cd27a5d1d290dc58aac..df662f4ad9e175f9f0739f853b27262e2e85e65c 100644 (file)
--- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
+++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py
@@ -143,7 +143,7 @@ def test_handle_command():
      m = OrchestratorCli('orchestrator', 0, 0)
      r = m._handle_command(None, cmd)
      assert r == HandleCommandResult(
-        retval=-2, stdout='', stderr='No orchestrator configured (try `ceph orch set backend`)')
+        retval=2, stdout='', stderr='No orchestrator configured (try `ceph orch set backend`)')
  
  
  r = OrchResult([ServiceDescription(spec=ServiceSpec(service_type='osd'), running=123)])
author	Kushal Deb <Kushal.Deb@ibm.com>
	Tue, 25 Mar 2025 04:48:05 +0000 (10:18 +0530)
committer	Kushal Deb <Kushal.Deb@ibm.com>
	Thu, 22 May 2025 04:47:46 +0000 (10:17 +0530)
src/pybind/mgr/cephadm/module.py		patch \| blob \| history
src/pybind/mgr/orchestrator/_interface.py		patch \| blob \| history
src/pybind/mgr/orchestrator/module.py		patch \| blob \| history
src/pybind/mgr/orchestrator/tests/test_orchestrator.py		patch \| blob \| history