From 41f1f010297aada2fd87ee6881abf6a77bcc4c0d Mon Sep 17 00:00:00 2001
From: Adam King
Date: Fri, 16 Feb 2024 11:24:32 -0500
Subject: [PATCH] mgr/cephadm: catch CancelledError in asyncio timeout handler

Specifically, concurrent.futures.CancelledError. At least
on python 3.9, this error can be raised when certain commands
being run asynchronously fail. Not catching this results
in the whole cephadm module crashing with something like

Traceback (most recent call last):
  File "/usr/share/ceph/mgr/cephadm/utils.py", line 94, in do_work
    return f(*arg)
  File "/usr/share/ceph/mgr/cephadm/serve.py", line 267, in refresh
    r = self._refresh_facts(host)
  File "/usr/share/ceph/mgr/cephadm/serve.py", line 370, in _refresh_facts
    val = self.mgr.wait_async(self._run_cephadm_json(
  File "/usr/share/ceph/mgr/cephadm/module.py", line 671, in wait_async
    return self.event_loop.get_result(coro, timeout)
  File "/usr/share/ceph/mgr/cephadm/ssh.py", line 64, in get_result
    return future.result(timeout)
  File "/lib64/python3.9/concurrent/futures/_base.py", line 444, in result
    raise CancelledError()
concurrent.futures._base.CancelledError

Fixes: https://tracker.ceph.com/issues/64473

Signed-off-by: Adam King
(cherry picked from commit 9c34973932bf3a0ec50c1c63bcba5e35bfe407e5)
---
 src/pybind/mgr/cephadm/module.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 08f28d3456f74..8a0546e24ae26 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -666,6 +666,16 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             else:
                 err_str += (f'(default {self.default_cephadm_command_timeout} second timeout)')
             raise OrchestratorError(err_str)
+        except concurrent.futures.CancelledError as e:
+            err_str = ''
+            if cmd:
+                err_str = f'Command "{cmd}" failed '
+            else:
+                err_str = 'Command failed '
+            if host:
+                err_str += f'on host {host} '
+            err_str += f' - {str(e)}'
+            raise OrchestratorError(err_str)
 
     def set_container_image(self, entity: str, image: str) -> None:
         self.check_mon_command({
-- 
2.39.5