From: Adam King Date: Mon, 14 Oct 2024 17:44:03 +0000 (-0400) Subject: cephadm: handle "systemctl start" failures during deployment better X-Git-Tag: v20.0.0~645^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5818305e8094f88949a7a63c93c6d76d0efa03d9;p=ceph.git cephadm: handle "systemctl start" failures during deployment better Previously it was assumed when the deploy command fails whatever daemon we were trying to deploy does not exist on the host. However, in the specific case where deploy fails trying to start the daemon's systemd unit this is not the case. This leads us to both cleanup the keyring for the daemon and also causes us to not trigger a refresh of the daemons on the host which can make cephadm attempt to deploy another daemon instead of just reporting the existing one as failed. To get around this we need to handle that specific failure as a success in the mgr module's deploy workflow so that we refresh the daemons and report the failure as intended. https://tracker.ceph.com/issues/68536 Signed-off-by: Adam King --- diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index e32e2bc49f31a..5d11d4700ff77 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -57,6 +57,7 @@ from cephadmlib.constants import ( LOG_DIR_MODE, SYSCTL_DIR, UNIT_DIR, + DAEMON_FAILED_ERROR, ) from cephadmlib.context import CephadmContext from cephadmlib.context_getters import ( @@ -72,6 +73,7 @@ from cephadmlib.exceptions import ( ClusterAlreadyExists, Error, UnauthorizedRegistryError, + DaemonStartException, ) from cephadmlib.exe_utils import find_executable, find_program from cephadmlib.call_wrappers import ( @@ -1246,7 +1248,11 @@ def deploy_daemon_units( call_throws(ctx, ['systemctl', 'enable', unit_name]) if start: clean_cgroup(ctx, ident.fsid, unit_name) - call_throws(ctx, ['systemctl', 'start', unit_name]) + try: + call_throws(ctx, ['systemctl', 'start', unit_name]) + except Exception as e: + logger.error(f'systemctl start failed for {unit_name}: {str(e)}') + raise DaemonStartException() def _osd_unit_run_commands( @@ -3046,7 +3052,10 @@ def get_deployment_type( @deprecated_command def command_deploy(ctx): # type: (CephadmContext) -> None - _common_deploy(ctx) + try: + _common_deploy(ctx) + except DaemonStartException: + sys.exit(DAEMON_FAILED_ERROR) def apply_deploy_config_to_ctx( @@ -3089,7 +3098,10 @@ def command_deploy_from(ctx: CephadmContext) -> None: config_data = read_configuration_source(ctx) logger.debug('Loaded deploy configuration: %r', config_data) apply_deploy_config_to_ctx(config_data, ctx) - _common_deploy(ctx) + try: + _common_deploy(ctx) + except DaemonStartException: + sys.exit(DAEMON_FAILED_ERROR) def _common_deploy(ctx: CephadmContext) -> None: diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index 354c378239802..6622f1eca554b 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -54,3 +54,4 @@ QUIET_LOG_LEVEL = 9 # DEBUG is 10, so using 9 to be lower level than DEBUG NO_DEPRECATED = False UID_NOBODY = 65534 GID_NOGROUP = 65534 +DAEMON_FAILED_ERROR = 17 diff --git a/src/cephadm/cephadmlib/exceptions.py b/src/cephadm/cephadmlib/exceptions.py index 0d215fdd3325a..762ce78212718 100644 --- a/src/cephadm/cephadmlib/exceptions.py +++ b/src/cephadm/cephadmlib/exceptions.py @@ -19,3 +19,16 @@ class UnauthorizedRegistryError(Error): class PortOccupiedError(Error): pass + + +class DaemonStartException(Exception): + """ + Special exception type we raise when the + systemctl start command fails during daemon + deployment. Necessary because the cephadm mgr module + needs to handle this case differently than a failure + earlier in the deploy process where no attempt was made + to actually start the daemon + """ + + pass diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 4a7959ae04502..8e9cd00fa8136 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1436,8 +1436,24 @@ class CephadmServe: config_blobs=daemon_spec.final_config, ).dump_json_str(), use_current_daemon_image=reconfig, + error_ok=True ) + # return number corresponding to DAEMON_FAILED_ERROR + # in src/cephadm/cephadmlib/constants. + # TODO: link these together so one cannot be changed without the other + if code == 17: + # daemon failed on systemctl start command, meaning while + # deployment failed the daemon is present and we should handle + # this as if the deploy command "succeeded" and mark the daemon + # as failed later when we fetch its status + self.mgr.log.error(f'Deployment of {daemon_spec.name()} failed during "systemctl start" command') + elif code: + # some other failure earlier in the deploy process. Just raise an exception + # the same as we would in _run_cephadm on a nonzero rc + raise OrchestratorError( + f'cephadm exited with an error code: {code}, stderr: {err}') + if daemon_spec.daemon_type == 'agent': self.mgr.agent_cache.agent_timestamp[daemon_spec.host] = datetime_now() self.mgr.agent_cache.agent_counter[daemon_spec.host] = 1 diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 975c125225dc8..6a2870a85c140 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -136,7 +136,7 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str mock.call(host, 'osd', 'ceph-volume', ['--', 'lvm', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), - mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, use_current_daemon_image=False), + mock.call(host, f'osd.{osd_id}', ['_orch', 'deploy'], [], stdin=mock.ANY, error_ok=True, use_current_daemon_image=False), mock.call(host, 'osd', 'ceph-volume', ['--', 'raw', 'list', '--format', 'json'], no_fsid=False, error_ok=False, image='', log_output=True, use_current_daemon_image=False), @@ -563,6 +563,7 @@ class TestCephadm(object): }, }, }), + error_ok=True, use_current_daemon_image=True, ) @@ -618,6 +619,7 @@ class TestCephadm(object): "crush_location": "datacenter=a", }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -660,6 +662,7 @@ class TestCephadm(object): "keyring": "[client.crash.test]\nkey = None\n", }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -702,6 +705,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -752,6 +756,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -806,6 +811,7 @@ class TestCephadm(object): }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 84d7c8f5b136d..a54c3c86b8df7 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -349,6 +349,7 @@ log_to_file = False""" }, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -489,6 +490,7 @@ timeout = 1.0\n""" } } }), + error_ok=True, use_current_daemon_image=False, ) @@ -601,6 +603,7 @@ class TestMonitoring: "use_url_prefix": False, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -696,6 +699,7 @@ class TestMonitoring: "use_url_prefix": True, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -788,6 +792,7 @@ class TestMonitoring: "use_url_prefix": False, } }), + error_ok=True, use_current_daemon_image=False, ) @@ -833,6 +838,7 @@ class TestMonitoring: "files": { "ceph-exporter.crt": "mycert", "ceph-exporter.key": "mykey"}}}), + error_ok=True, use_current_daemon_image=False) @patch("cephadm.serve.CephadmServe._run_cephadm") @@ -876,6 +882,7 @@ class TestMonitoring: }, "config_blobs": {} }), + error_ok=True, use_current_daemon_image=False, ) @@ -936,6 +943,7 @@ class TestMonitoring: 'web_config': '/etc/node-exporter/web.yml', } }), + error_ok=True, use_current_daemon_image=False, ) @@ -1071,6 +1079,7 @@ class TestMonitoring: "use_url_prefix": False }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1303,6 +1312,7 @@ class TestMonitoring: "use_url_prefix": False }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1379,6 +1389,7 @@ class TestMonitoring: }, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1438,6 +1449,7 @@ class TestMonitoring: }, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1588,6 +1600,7 @@ class TestMonitoring: "files": files, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1718,6 +1731,7 @@ class TestMonitoring: "files": files, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -1831,6 +1845,7 @@ class TestMonitoring: "files": files, }, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2005,6 +2020,7 @@ spec: }, "config_blobs": {}, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2112,6 +2128,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2160,6 +2177,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2212,6 +2230,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -2269,6 +2288,7 @@ class TestSNMPGateway: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3359,6 +3379,7 @@ class TestJaeger: }, "config_blobs": config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3399,6 +3420,7 @@ class TestJaeger: }, "config_blobs": es_config, }), + error_ok=True, use_current_daemon_image=False, ) with with_service(cephadm_module, collector_spec): @@ -3427,6 +3449,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3467,6 +3490,7 @@ class TestJaeger: }, "config_blobs": collector_config, }), + error_ok=True, use_current_daemon_image=False, ) with with_service(cephadm_module, agent_spec): @@ -3495,6 +3519,7 @@ class TestJaeger: }, "config_blobs": agent_config, }), + error_ok=True, use_current_daemon_image=False, ) @@ -3552,6 +3577,7 @@ class TestCustomContainer: }, } ), + error_ok=True, use_current_daemon_image=False, ) @@ -3639,6 +3665,7 @@ class TestCustomContainer: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -3692,6 +3719,7 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -3764,6 +3792,7 @@ class TestSMB: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -4009,6 +4038,7 @@ class TestMgmtGateway: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -4350,6 +4380,7 @@ class TestMgmtGateway: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, ) @@ -4473,5 +4504,6 @@ class TestMgmtGateway: ['_orch', 'deploy'], [], stdin=json.dumps(expected), + error_ok=True, use_current_daemon_image=False, )