]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: Add OSDService.post_remove() 42989/head
authorSebastian Wagner <sewagner@redhat.com>
Tue, 31 Aug 2021 09:38:14 +0000 (11:38 +0200)
committerSebastian Wagner <sewagner@redhat.com>
Wed, 1 Sep 2021 11:00:26 +0000 (13:00 +0200)
Do not remove the osd.N keyring, if we failed to deploy the OSD, because
we cannot recover from it. The OSD keys are created by ceph-volume and not by
us.

Signed-off-by: Sebastian Wagner <sewagner@redhat.com>
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/services/cephadmservice.py
src/pybind/mgr/cephadm/services/iscsi.py
src/pybind/mgr/cephadm/services/nfs.py
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/test_cephadm.py

index 27d3734e28a0a86bff7d7bfd46b16d5800ba24fc..c55e044e6541ad98e41bf832460a2d0a99282a68 100644 (file)
@@ -1424,7 +1424,7 @@ Then run the following:
 
                 if d.daemon_type != 'osd':
                     self.cephadm_services[str(d.daemon_type)].pre_remove(d)
-                    self.cephadm_services[str(d.daemon_type)].post_remove(d)
+                    self.cephadm_services[str(d.daemon_type)].post_remove(d, is_failed_deploy=False)
                 else:
                     cmd_args = {
                         'prefix': 'osd purge-actual',
index 2f0e87a8a401caa78f7e84a0de5fb6b427d28c8b..7d2398b7164a64ff97c8d5bb9b6ddd00e98d7083 100644 (file)
@@ -1085,7 +1085,7 @@ class CephadmServe:
                     # we have to clean up the daemon. E.g. keyrings.
                     servict_type = daemon_type_to_service(daemon_spec.daemon_type)
                     dd = daemon_spec.to_daemon_description(DaemonDescriptionStatus.error, 'failed')
-                    self.mgr.cephadm_services[servict_type].post_remove(dd)
+                    self.mgr.cephadm_services[servict_type].post_remove(dd, is_failed_deploy=True)
                 raise
 
     def _remove_daemon(self, name: str, host: str) -> str:
@@ -1113,7 +1113,8 @@ class CephadmServe:
                 self.mgr.cache.rm_daemon(host, name)
             self.mgr.cache.invalidate_host_daemons(host)
 
-            self.mgr.cephadm_services[daemon_type_to_service(daemon_type)].post_remove(daemon)
+            self.mgr.cephadm_services[daemon_type_to_service(
+                daemon_type)].post_remove(daemon, is_failed_deploy=False)
 
             return "Removed {} from host '{}'".format(name, host)
 
index 17426eb41dbdaf69ace9e98e4afbd2d0947dedb8..2453d5d3894060b462605a7a92ef0145446ce1f9 100644 (file)
@@ -401,7 +401,7 @@ class CephadmService(metaclass=ABCMeta):
         assert self.TYPE == daemon_type_to_service(daemon.daemon_type)
         logger.debug(f'Pre remove daemon {self.TYPE}.{daemon.daemon_id}')
 
-    def post_remove(self, daemon: DaemonDescription) -> None:
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
         """
         Called after the daemon is removed.
         """
@@ -429,8 +429,8 @@ class CephService(CephadmService):
 
         return cephadm_config, []
 
-    def post_remove(self, daemon: DaemonDescription) -> None:
-        super().post_remove(daemon)
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
+        super().post_remove(daemon, is_failed_deploy=is_failed_deploy)
         self.remove_keyring(daemon)
 
     def get_auth_entity(self, daemon_id: str, host: str = "") -> AuthEntity:
@@ -584,7 +584,7 @@ class MonService(CephService):
             'name': daemon_id,
         })
 
-    def post_remove(self, daemon: DaemonDescription) -> None:
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
         # Do not remove the mon keyring.
         # super().post_remove(daemon)
         pass
@@ -880,8 +880,8 @@ class RgwService(CephService):
         })
         self.mgr.trigger_connect_dashboard_rgw()
 
-    def post_remove(self, daemon: DaemonDescription) -> None:
-        super().post_remove(daemon)
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
+        super().post_remove(daemon, is_failed_deploy=is_failed_deploy)
         self.mgr.check_mon_command({
             'prefix': 'config rm',
             'who': utils.name_to_config_section(daemon.name()),
index 3eecb07e943977e1854b0c68f320724357d2933a..750ed956f853272f760a2d5c23553a15837a4588 100644 (file)
@@ -144,7 +144,7 @@ class IscsiService(CephService):
         warn_message = f'It is presumed safe to stop {names}'
         return HandleCommandResult(0, warn_message, '')
 
-    def post_remove(self, daemon: DaemonDescription) -> None:
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
         """
         Called after the daemon is removed.
         """
index 09c1da32932a8a109d2f127f4c2cc0bb05ac80b2..ee53283bd9c185aa135a3d06efdde6ba62316346 100644 (file)
@@ -246,8 +246,8 @@ class NFSService(CephService):
             'entity': entity,
         })
 
-    def post_remove(self, daemon: DaemonDescription) -> None:
-        super().post_remove(daemon)
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
+        super().post_remove(daemon, is_failed_deploy=is_failed_deploy)
         self.remove_rgw_keyring(daemon)
 
     def ok_to_stop(self,
index 3c30a413980210f4077ad35d47622fb2322b0f87..7b63fe7aa4e93486e4edaab47cad27a9a08e4d22 100644 (file)
@@ -14,7 +14,7 @@ import orchestrator
 from cephadm.serve import CephadmServe
 from cephadm.utils import forall_hosts
 from ceph.utils import datetime_now
-from orchestrator import OrchestratorError
+from orchestrator import OrchestratorError, DaemonDescription
 from mgr_module import MonCommandFailed
 
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec, CephService
@@ -306,6 +306,13 @@ class OSDService(CephService):
     def get_osdspec_affinity(self, osd_id: str) -> str:
         return self.mgr.get('osd_metadata').get(osd_id, {}).get('osdspec_affinity', '')
 
+    def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
+        # Do not remove the osd.N keyring, if we failed to deploy the OSD, because
+        # we cannot recover from it. The OSD keys are created by ceph-volume and not by
+        # us.
+        if not is_failed_deploy:
+            super().post_remove(daemon, is_failed_deploy=is_failed_deploy)
+
 
 class OsdIdClaims(object):
     """
index 951d81f5761862e146e0f5cde89347c0a2f06e4d..296047a6fbeb3dc1d198b3c95434274d515e1aa4 100644 (file)
@@ -1358,6 +1358,46 @@ Traceback (most recent call last):
                           stdin=mock.ANY, image=''),
             ]
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    def test_osd_activate_datadevice_fail(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+        _run_cephadm.return_value = ('{}', '', 0)
+        with with_host(cephadm_module, 'test', refresh_hosts=False):
+            cephadm_module.mock_store_set('_ceph_get', 'osd_map', {
+                'osds': [
+                    {
+                        'osd': 1,
+                        'up_from': 0,
+                        'uuid': 'uuid'
+                    }
+                ]
+            })
+
+            ceph_volume_lvm_list = {
+                '1': [{
+                    'tags': {
+                        'ceph.cluster_fsid': cephadm_module._cluster_fsid,
+                        'ceph.osd_fsid': 'uuid'
+                    },
+                    'type': 'data'
+                }]
+            }
+            _run_cephadm.reset_mock(return_value=True)
+
+            def _r_c(*args, **kwargs):
+                if 'ceph-volume' in args:
+                    return (json.dumps(ceph_volume_lvm_list), '', 0)
+                else:
+                    assert 'deploy' in args
+                    raise OrchestratorError("let's fail somehow")
+            _run_cephadm.side_effect = _r_c
+            assert cephadm_module._osd_activate(
+                ['test']).stderr == "let's fail somehow"
+            with pytest.raises(AssertionError):
+                cephadm_module.assert_issued_mon_command({
+                    'prefix': 'auth rm',
+                    'entity': 'osd.1',
+                })
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
     def test_osd_activate_datadevice_dbdevice(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
         _run_cephadm.return_value = ('{}', '', 0)