From: Kushal Deb Date: Tue, 25 Mar 2025 04:48:05 +0000 (+0530) Subject: cephadm: Provide appropriate exit codes for orch operations X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f9ba2e9d5f8cf95ca266156610d404f1a5fea58f;p=ceph.git cephadm: Provide appropriate exit codes for orch operations Signed-off-by: Kushal Deb --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1a5389d0260..7609ae099c8 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2093,12 +2093,13 @@ Then run the following: notifications: List[str] = [] error_notifications: List[str] = [] - okay: bool = True + initial_rc = 0 for daemon_type, daemon_ids in daemon_map.items(): r = service_registry.get_service(daemon_type_to_service( daemon_type)).ok_to_stop(daemon_ids, force=force) if r.retval: - okay = False + initial_rc = r.retval if initial_rc == 0 else initial_rc + # collect error notifications so user can see every daemon causing host # to not be okay to stop error_notifications.append(r.stderr) @@ -2106,9 +2107,9 @@ Then run the following: # if extra notifications to print for user, add them to notifications list notifications.append(r.stdout) - if not okay: + if initial_rc: # at least one daemon is not okay to stop - return 1, '\n'.join(error_notifications) + return initial_rc, '\n'.join(error_notifications) if notifications: return 0, (f'It is presumed safe to stop host {hostname}. ' @@ -2118,7 +2119,7 @@ Then run the following: @handle_orch_error def host_ok_to_stop(self, hostname: str) -> str: if hostname not in self.cache.get_hosts(): - raise OrchestratorError(f'Cannot find host "{hostname}"') + raise OrchestratorError(f'Cannot find host "{hostname}"', errno=errno.EINVAL) rc, msg = self._host_ok_to_stop(hostname) if rc: @@ -2152,20 +2153,20 @@ Then run the following: :raises OrchestratorError: Hostname is invalid, host is already in maintenance """ + if yes_i_really_mean_it and not force: - raise OrchestratorError("--force must be passed with --yes-i-really-mean-it") + raise OrchestratorError("force must be passed with --yes-i-really-mean-it", errno=errno.EINVAL) if len(self.cache.get_hosts()) == 1 and not yes_i_really_mean_it: - raise OrchestratorError("Maintenance feature is not supported on single node clusters") + raise OrchestratorError("Maintenance feature is not supported on single node clusters", errno=errno.ENOTSUP) # if upgrade is active, deny if self.upgrade.upgrade_state and not yes_i_really_mean_it: - raise OrchestratorError( - f"Unable to place {hostname} in maintenance with upgrade active/paused") + raise OrchestratorError(f"Unabe to place {hostname} in maintenance with upgrade active/paused", errno=errno.EAGAIN) tgt_host = self.inventory._inventory[hostname] if tgt_host.get("status", "").lower() == "maintenance": - raise OrchestratorError(f"Host {hostname} is already in maintenance") + raise OrchestratorError(f"Host {hostname} is already in maintenance", errno=errno.EALREADY) host_daemons = self.cache.get_daemon_types(hostname) self.log.debug("daemons on host {}".format(','.join(host_daemons))) @@ -2186,9 +2187,7 @@ Then run the following: error_ok=True)) returned_msg = _err[0].split('\n')[-1] if (returned_msg.startswith('failed') or returned_msg.startswith('ERROR')) and not yes_i_really_mean_it: - raise OrchestratorError( - f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}") - + raise OrchestratorError(f"Failed to place {hostname} into maintenance for cluster {self._cluster_fsid}", errno=errno.EPERM) if "osd" in host_daemons: crush_node = hostname if '.' not in hostname else hostname.split('.')[0] rc, out, err = self.mon_command({ @@ -2201,7 +2200,7 @@ Then run the following: self.log.warning( f"maintenance mode request for {hostname} failed to SET the noout group (rc={rc})") raise OrchestratorError( - f"Unable to set the osds on {hostname} to noout (rc={rc})") + f"Unable to set the osds on {hostname} to noout (rc={rc})", errno=errno.EIO) elif not rc: self.log.info( f"maintenance mode request for {hostname} has SET the noout group") @@ -2232,7 +2231,7 @@ Then run the following: """ tgt_host = self.inventory._inventory[hostname] if tgt_host['status'] != "maintenance": - raise OrchestratorError(f"Host {hostname} is not in maintenance mode") + raise OrchestratorError(f"Host {hostname} is not in maintenance mode", errno=errno.EINVAL) # Given we do not regularly check maintenance mode hosts for being offline, # we have no idea at this point whether the host is online or not. @@ -2256,14 +2255,14 @@ Then run the following: if host_offline and not offline: raise OrchestratorValidationError( - f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode') + f'{hostname} is offline, please use --offline and --force to take this host out of maintenance mode', errno=errno.ENOTSUP) if not host_offline and offline: raise OrchestratorValidationError( - f'{hostname} is online, please take host out of maintenance mode without --offline.') + f'{hostname} is online, please take host out of maintenance mode without --offline.', errno=errno.EAGAIN) if offline and not force: - raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force") + raise OrchestratorValidationError("Taking an offline host out of maintenance mode requires --force", errno=errno.EAGAIN) # no point trying these parts if we know the host is offline if not host_offline: @@ -2277,7 +2276,7 @@ Then run the following: f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}") if not force: raise OrchestratorError( - f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}") + f"Failed to exit maintenance state for host {hostname}, cluster {self._cluster_fsid}", errno=errno.EPERM) if "osd" in self.cache.get_daemon_types(hostname): crush_node = hostname if '.' not in hostname else hostname.split('.')[0] @@ -2291,7 +2290,7 @@ Then run the following: self.log.warning( f"exit maintenance request failed to UNSET the noout group for {hostname}, (rc={rc})") if not force: - raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})") + raise OrchestratorError(f"Unable to set the osds on {hostname} to noout (rc={rc})", errno=errno.EIO) else: self.log.info( f"exit maintenance request has UNSET for the noout group on host {hostname}") @@ -2720,7 +2719,7 @@ Then run the following: if name in dm: args.append((name, host)) if not args: - raise OrchestratorError('Unable to find daemon(s) %s' % (names)) + raise OrchestratorError('Unable to find daemon(s) %s' % (names), errno=errno.EINVAL) self.log.info('Remove daemons %s' % ' '.join([a[0] for a in args])) return self._remove_daemons(args) diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 0fc103344e3..d2691a1a0ae 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -74,7 +74,7 @@ class OrchestratorError(Exception): errno: int = -errno.EINVAL, event_kind_subject: Optional[Tuple[str, str]] = None) -> None: super(Exception, self).__init__(msg) - self.errno = errno + self.errno = abs(errno) # See OrchestratorEvent.subject self.event_subject = event_kind_subject @@ -111,12 +111,12 @@ def handle_exception(prefix: str, perm: str, func: FuncT) -> FuncT: return func(*args, **kwargs) except (OrchestratorError, SpecValidationError) as e: # Do not print Traceback for expected errors. - return HandleCommandResult(e.errno, stderr=str(e)) + return HandleCommandResult(retval=e.errno, stderr=str(e)) except ImportError as e: - return HandleCommandResult(-errno.ENOENT, stderr=str(e)) + return HandleCommandResult(retval=-errno.ENOENT, stderr=str(e)) except NotImplementedError: msg = 'This Orchestrator does not support `{}`'.format(prefix) - return HandleCommandResult(-errno.ENOENT, stderr=msg) + return HandleCommandResult(retval=-errno.ENOENT, stderr=msg) # misuse lambda to copy `wrapper` wrapper_copy = lambda *l_args, **l_kwargs: wrapper(*l_args, **l_kwargs) # noqa: E731 @@ -245,6 +245,25 @@ def raise_if_exception(c: OrchResult[T]) -> T: return c.result +def completion_to_result(c: OrchResult[T]) -> HandleCommandResult: + """ + Converts an OrchResult to a HandleCommandResult, + preserving output and error codes. + """ + if c.serialized_exception is None: + assert c.result is not None, "OrchResult should either have result or an exception" + return HandleCommandResult(stdout=c.result_str()) + + try: + e = pickle.loads(c.serialized_exception) + except (KeyError, AttributeError): + return HandleCommandResult(stderr=c.exception_str, retval=errno.EIO) + if isinstance(e, OrchestratorError): + return HandleCommandResult(stderr=str(e), retval=-e.errno) + + raise e + + def _hide_in_features(f: FuncT) -> FuncT: f._hide_in_features = True # type: ignore return f @@ -489,7 +508,7 @@ class Orchestrator(object): """ raise NotImplementedError() - def host_ok_to_stop(self, hostname: str) -> OrchResult: + def host_ok_to_stop(self, hostname: str) -> OrchResult[str]: """ Check if the specified host can be safely stopped without reducing availability @@ -497,13 +516,13 @@ class Orchestrator(object): """ raise NotImplementedError() - def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult: + def enter_host_maintenance(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> OrchResult[str]: """ Place a host in maintenance, stopping daemons and disabling it's systemd target """ raise NotImplementedError() - def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult: + def exit_host_maintenance(self, hostname: str, force: bool = False, offline: bool = False) -> OrchResult[str]: """ Return a host from maintenance, restarting the clusters systemd target """ diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 151616c8486..ddef2d13197 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -55,6 +55,7 @@ from ._interface import ( _cli_write_command, json_to_generic_spec, raise_if_exception, + completion_to_result, ) @@ -786,8 +787,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, def _host_ok_to_stop(self, hostname: str) -> HandleCommandResult: """Check if the specified host can be safely stopped without reducing availability""""" completion = self.host_ok_to_stop(hostname) - raise_if_exception(completion) - return HandleCommandResult(stdout=completion.result_str()) + return completion_to_result(completion) @_cli_write_command('orch host maintenance enter') def _host_maintenance_enter(self, hostname: str, force: bool = False, yes_i_really_mean_it: bool = False) -> HandleCommandResult: @@ -795,9 +795,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only) """ completion = self.enter_host_maintenance(hostname, force=force, yes_i_really_mean_it=yes_i_really_mean_it) - raise_if_exception(completion) - - return HandleCommandResult(stdout=completion.result_str()) + return completion_to_result(completion) @_cli_write_command('orch host maintenance exit') def _host_maintenance_exit(self, hostname: str, force: bool = False, offline: bool = False) -> HandleCommandResult: @@ -805,9 +803,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, Return a host from maintenance, restarting all Ceph daemons (cephadm only) """ completion = self.exit_host_maintenance(hostname, force, offline) - raise_if_exception(completion) - - return HandleCommandResult(stdout=completion.result_str()) + return completion_to_result(completion) @_cli_write_command('orch host rescan') def _host_rescan(self, hostname: str, with_summary: bool = False) -> HandleCommandResult: @@ -1783,14 +1779,12 @@ Usage: """Remove specific daemon(s)""" for name in names: if '.' not in name: - raise OrchestratorError('%s is not a valid daemon name' % name) + return HandleCommandResult(stderr=f"{name} is not a valid daemon name", retval=-errno.EINVAL) (daemon_type) = name.split('.')[0] if not force and daemon_type in ['osd', 'mon', 'prometheus']: - raise OrchestratorError( - 'must pass --force to REMOVE daemon with potentially PRECIOUS DATA for %s' % name) + return HandleCommandResult(stderr=f"must pass --force to REMOVE daemon with potentially PRECIOUS DATA for {name}", retval=-errno.EPERM) completion = self.remove_daemons(names) - raise_if_exception(completion) - return HandleCommandResult(stdout=completion.result_str()) + return completion_to_result(completion) @_cli_write_command('orch rm') def _service_rm(self, diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py index 079e8d7f84b..df662f4ad9e 100644 --- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py +++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py @@ -143,7 +143,7 @@ def test_handle_command(): m = OrchestratorCli('orchestrator', 0, 0) r = m._handle_command(None, cmd) assert r == HandleCommandResult( - retval=-2, stdout='', stderr='No orchestrator configured (try `ceph orch set backend`)') + retval=2, stdout='', stderr='No orchestrator configured (try `ceph orch set backend`)') r = OrchResult([ServiceDescription(spec=ServiceSpec(service_type='osd'), running=123)])