From 9c34973932bf3a0ec50c1c63bcba5e35bfe407e5 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Fri, 16 Feb 2024 11:24:32 -0500
Subject: [PATCH] mgr/cephadm: catch CancelledError in asyncio timeout handler

Specifically, concurrent.futures.CancelledError. At least
on Python 3.9, this error can be raised when certain
commands being run asynchronously fail. Not catching
this results in the whole cephadm module crashing
with something like

Traceback (most recent call last):
  File "/usr/share/ceph/mgr/cephadm/utils.py", line 94, in do_work
    return f(*arg)
  File "/usr/share/ceph/mgr/cephadm/serve.py", line 267, in refresh
    r = self._refresh_facts(host)
  File "/usr/share/ceph/mgr/cephadm/serve.py", line 370, in _refresh_facts
    val = self.mgr.wait_async(self._run_cephadm_json(
  File "/usr/share/ceph/mgr/cephadm/module.py", line 671, in wait_async
    return self.event_loop.get_result(coro, timeout)
  File "/usr/share/ceph/mgr/cephadm/ssh.py", line 64, in get_result
    return future.result(timeout)
  File "/lib64/python3.9/concurrent/futures/_base.py", line 444, in result
    raise CancelledError()
concurrent.futures._base.CancelledError

Fixes: https://tracker.ceph.com/issues/64473

Signed-off-by: Adam King
---
 src/pybind/mgr/cephadm/module.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 87f7024bb2555..7d9aa02bade46 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -761,6 +761,16 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             else:
                 err_str += (f'(default {self.default_cephadm_command_timeout} second timeout)')
             raise OrchestratorError(err_str)
+        except concurrent.futures.CancelledError as e:
+            err_str = ''
+            if cmd:
+                err_str = f'Command "{cmd}" failed '
+            else:
+                err_str = 'Command failed '
+            if host:
+                err_str += f'on host {host} '
+            err_str += f' - {str(e)}'
+            raise OrchestratorError(err_str)
 
     def set_container_image(self, entity: str, image: str) -> None:
         self.check_mon_command({
-- 
2.39.5