From d7a4c5794034e60e94dd12951f7dbf4685647686 Mon Sep 17 00:00:00 2001 From: Sebastian Wagner Date: Tue, 31 Aug 2021 11:38:14 +0200 Subject: [PATCH] mgr/cephadm: Add OSDService.post_remove() Do not remove the osd.N keyring, if we failed to deploy the OSD, because we cannot recover from it. The OSD keys are created by ceph-volume and not by us. Signed-off-by: Sebastian Wagner --- src/pybind/mgr/cephadm/module.py | 2 +- src/pybind/mgr/cephadm/serve.py | 5 ++- .../mgr/cephadm/services/cephadmservice.py | 12 +++--- src/pybind/mgr/cephadm/services/iscsi.py | 2 +- src/pybind/mgr/cephadm/services/nfs.py | 4 +- src/pybind/mgr/cephadm/services/osd.py | 9 ++++- src/pybind/mgr/cephadm/tests/test_cephadm.py | 40 +++++++++++++++++++ 7 files changed, 61 insertions(+), 13 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 27d3734e28a0..c55e044e6541 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1424,7 +1424,7 @@ Then run the following: if d.daemon_type != 'osd': self.cephadm_services[str(d.daemon_type)].pre_remove(d) - self.cephadm_services[str(d.daemon_type)].post_remove(d) + self.cephadm_services[str(d.daemon_type)].post_remove(d, is_failed_deploy=False) else: cmd_args = { 'prefix': 'osd purge-actual', diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 2f0e87a8a401..7d2398b7164a 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1085,7 +1085,7 @@ class CephadmServe: # we have to clean up the daemon. E.g. keyrings. servict_type = daemon_type_to_service(daemon_spec.daemon_type) dd = daemon_spec.to_daemon_description(DaemonDescriptionStatus.error, 'failed') - self.mgr.cephadm_services[servict_type].post_remove(dd) + self.mgr.cephadm_services[servict_type].post_remove(dd, is_failed_deploy=True) raise def _remove_daemon(self, name: str, host: str) -> str: @@ -1113,7 +1113,8 @@ class CephadmServe: self.mgr.cache.rm_daemon(host, name) self.mgr.cache.invalidate_host_daemons(host) - self.mgr.cephadm_services[daemon_type_to_service(daemon_type)].post_remove(daemon) + self.mgr.cephadm_services[daemon_type_to_service( + daemon_type)].post_remove(daemon, is_failed_deploy=False) return "Removed {} from host '{}'".format(name, host) diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 17426eb41dbd..2453d5d38940 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -401,7 +401,7 @@ class CephadmService(metaclass=ABCMeta): assert self.TYPE == daemon_type_to_service(daemon.daemon_type) logger.debug(f'Pre remove daemon {self.TYPE}.{daemon.daemon_id}') - def post_remove(self, daemon: DaemonDescription) -> None: + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: """ Called after the daemon is removed. """ @@ -429,8 +429,8 @@ class CephService(CephadmService): return cephadm_config, [] - def post_remove(self, daemon: DaemonDescription) -> None: - super().post_remove(daemon) + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: + super().post_remove(daemon, is_failed_deploy=is_failed_deploy) self.remove_keyring(daemon) def get_auth_entity(self, daemon_id: str, host: str = "") -> AuthEntity: @@ -584,7 +584,7 @@ class MonService(CephService): 'name': daemon_id, }) - def post_remove(self, daemon: DaemonDescription) -> None: + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: # Do not remove the mon keyring. # super().post_remove(daemon) pass @@ -880,8 +880,8 @@ class RgwService(CephService): }) self.mgr.trigger_connect_dashboard_rgw() - def post_remove(self, daemon: DaemonDescription) -> None: - super().post_remove(daemon) + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: + super().post_remove(daemon, is_failed_deploy=is_failed_deploy) self.mgr.check_mon_command({ 'prefix': 'config rm', 'who': utils.name_to_config_section(daemon.name()), diff --git a/src/pybind/mgr/cephadm/services/iscsi.py b/src/pybind/mgr/cephadm/services/iscsi.py index 3eecb07e9439..750ed956f853 100644 --- a/src/pybind/mgr/cephadm/services/iscsi.py +++ b/src/pybind/mgr/cephadm/services/iscsi.py @@ -144,7 +144,7 @@ class IscsiService(CephService): warn_message = f'It is presumed safe to stop {names}' return HandleCommandResult(0, warn_message, '') - def post_remove(self, daemon: DaemonDescription) -> None: + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: """ Called after the daemon is removed. """ diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py index 09c1da32932a..ee53283bd9c1 100644 --- a/src/pybind/mgr/cephadm/services/nfs.py +++ b/src/pybind/mgr/cephadm/services/nfs.py @@ -246,8 +246,8 @@ class NFSService(CephService): 'entity': entity, }) - def post_remove(self, daemon: DaemonDescription) -> None: - super().post_remove(daemon) + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: + super().post_remove(daemon, is_failed_deploy=is_failed_deploy) self.remove_rgw_keyring(daemon) def ok_to_stop(self, diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 3c30a4139802..7b63fe7aa4e9 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -14,7 +14,7 @@ import orchestrator from cephadm.serve import CephadmServe from cephadm.utils import forall_hosts from ceph.utils import datetime_now -from orchestrator import OrchestratorError +from orchestrator import OrchestratorError, DaemonDescription from mgr_module import MonCommandFailed from cephadm.services.cephadmservice import CephadmDaemonDeploySpec, CephService @@ -306,6 +306,13 @@ class OSDService(CephService): def get_osdspec_affinity(self, osd_id: str) -> str: return self.mgr.get('osd_metadata').get(osd_id, {}).get('osdspec_affinity', '') + def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None: + # Do not remove the osd.N keyring, if we failed to deploy the OSD, because + # we cannot recover from it. The OSD keys are created by ceph-volume and not by + # us. + if not is_failed_deploy: + super().post_remove(daemon, is_failed_deploy=is_failed_deploy) + class OsdIdClaims(object): """ diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 951d81f57618..296047a6fbeb 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1358,6 +1358,46 @@ Traceback (most recent call last): stdin=mock.ANY, image=''), ] + @mock.patch("cephadm.serve.CephadmServe._run_cephadm") + def test_osd_activate_datadevice_fail(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.return_value = ('{}', '', 0) + with with_host(cephadm_module, 'test', refresh_hosts=False): + cephadm_module.mock_store_set('_ceph_get', 'osd_map', { + 'osds': [ + { + 'osd': 1, + 'up_from': 0, + 'uuid': 'uuid' + } + ] + }) + + ceph_volume_lvm_list = { + '1': [{ + 'tags': { + 'ceph.cluster_fsid': cephadm_module._cluster_fsid, + 'ceph.osd_fsid': 'uuid' + }, + 'type': 'data' + }] + } + _run_cephadm.reset_mock(return_value=True) + + def _r_c(*args, **kwargs): + if 'ceph-volume' in args: + return (json.dumps(ceph_volume_lvm_list), '', 0) + else: + assert 'deploy' in args + raise OrchestratorError("let's fail somehow") + _run_cephadm.side_effect = _r_c + assert cephadm_module._osd_activate( + ['test']).stderr == "let's fail somehow" + with pytest.raises(AssertionError): + cephadm_module.assert_issued_mon_command({ + 'prefix': 'auth rm', + 'entity': 'osd.1', + }) + @mock.patch("cephadm.serve.CephadmServe._run_cephadm") def test_osd_activate_datadevice_dbdevice(self, _run_cephadm, cephadm_module: CephadmOrchestrator): _run_cephadm.return_value = ('{}', '', 0) -- 2.47.3