mgr/cephadm: cleanup daemon keyring on failed daemon deploy
author     Sebastian Wagner <sebastian.wagner@suse.com>
           Wed, 3 Feb 2021 15:24:25 +0000 (16:24 +0100)
committer  Sebastian Wagner <sebastian.wagner@suse.com>
           Tue, 23 Feb 2021 09:58:23 +0000 (10:58 +0100)
Fixes: https://tracker.ceph.com/issues/48164
Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
(cherry picked from commit f200cb26cc0d796c3af583e1d2bf72ec65213cd6)
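
In short: CephadmServe._create_daemon() now wraps the whole deploy path in a
try/except. If an OrchestratorError is raised while deploying a new daemon
(but not while reconfiguring an existing one), the daemon spec is converted to
a DaemonDescription via the new to_daemon_description() helper and handed to
the owning service's post_remove(), which (for Ceph daemons) removes the
daemon's auth keyring via 'auth rm' before the error is re-raised.
remove_keyring() switches from check_mon_command() to mon_command(),
presumably so a failed 'auth rm' during cleanup cannot raise and mask the
original deploy error, and the test mocks gain a mon-command recorder
(assert_issued_mon_command) so the new unit test can verify that 'auth rm'
for mgr.x is actually issued when the deploy fails.

A minimal, self-contained sketch of the cleanup-on-failure pattern
(DeployError, KeyringStore and deploy_daemon are illustrative stand-ins, not
the real cephadm API):

    class DeployError(Exception):
        """Stand-in for OrchestratorError."""
        pass

    class KeyringStore:
        """Stand-in for the cluster auth database ('auth get-or-create' / 'auth rm')."""
        def __init__(self) -> None:
            self.entities: set = set()

        def create(self, entity: str) -> None:
            self.entities.add(entity)

        def remove(self, entity: str) -> None:
            self.entities.discard(entity)

    def deploy_daemon(store: KeyringStore, entity: str,
                      reconfig: bool = False, fail: bool = False) -> str:
        if not reconfig:
            store.create(entity)      # the keyring exists before the deploy runs
        try:
            if fail:
                raise DeployError('deploy failed')
            return f'Deployed {entity}'
        except DeployError:
            if not reconfig:
                # mirrors post_remove(): drop the keyring so a retry starts clean
                store.remove(entity)
            raise

    store = KeyringStore()
    try:
        deploy_daemon(store, 'mgr.x', fail=True)
    except DeployError:
        pass
    assert 'mgr.x' not in store.entities   # keyring was removed after the failure

The keyring created for the failed daemon is gone before the exception
propagates, which is what the new test_daemon_add_fail test asserts for the
real code path (an 'auth rm' for mgr.x is sent to the mons).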

src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/services/cephadmservice.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/tests/__init__.py

diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 3dfec9d698f9d127d9a9f3e6bd54ed394a13320f..13ae1fec1f1bebce88f83f17ca2373c91fe2a73c 100644
@@ -584,7 +584,8 @@ class CephadmServe:
             daemon_ids = [d.daemon_id for d in remove_daemon_hosts]
             assert None not in daemon_ids
             # setting force flag retains previous behavior, should revisit later.
-            r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids), force=True)
+            r = self.mgr.cephadm_services[service_type].ok_to_stop(
+                cast(List[str], daemon_ids), force=True)
             return not r.retval
 
         while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
@@ -735,111 +736,114 @@ class CephadmServe:
                 hostname=daemon_spec.host,
         ).service_id(), overwrite=True):
 
-            image = ''
-            start_time = datetime_now()
-            ports: List[int] = daemon_spec.ports if daemon_spec.ports else []
-
-            if daemon_spec.daemon_type == 'container':
-                spec: Optional[CustomContainerSpec] = daemon_spec.spec
-                if spec is None:
-                    # Exit here immediately because the required service
-                    # spec to create a daemon is not provided. This is only
-                    # provided when a service is applied via 'orch apply'
-                    # command.
-                    msg = "Failed to {} daemon {} on {}: Required " \
-                          "service specification not provided".format(
-                              'reconfigure' if reconfig else 'deploy',
-                              daemon_spec.name(), daemon_spec.host)
-                    self.log.info(msg)
-                    return msg
-                image = spec.image
-                if spec.ports:
-                    ports.extend(spec.ports)
-
-            if daemon_spec.daemon_type == 'cephadm-exporter':
-                if not reconfig:
-                    assert daemon_spec.host
-                    deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
-                    if not deploy_ok:
-                        msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
-                        self.log.warning(msg)
+            try:
+                image = ''
+                start_time = datetime_now()
+                ports: List[int] = daemon_spec.ports if daemon_spec.ports else []
+
+                if daemon_spec.daemon_type == 'container':
+                    spec: Optional[CustomContainerSpec] = daemon_spec.spec
+                    if spec is None:
+                        # Exit here immediately because the required service
+                        # spec to create a daemon is not provided. This is only
+                        # provided when a service is applied via 'orch apply'
+                        # command.
+                        msg = "Failed to {} daemon {} on {}: Required " \
+                              "service specification not provided".format(
+                                  'reconfigure' if reconfig else 'deploy',
+                                  daemon_spec.name(), daemon_spec.host)
+                        self.log.info(msg)
                         return msg
+                    image = spec.image
+                    if spec.ports:
+                        ports.extend(spec.ports)
+
+                if daemon_spec.daemon_type == 'cephadm-exporter':
+                    if not reconfig:
+                        assert daemon_spec.host
+                        deploy_ok = self._deploy_cephadm_binary(daemon_spec.host)
+                        if not deploy_ok:
+                            msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}"
+                            self.log.warning(msg)
+                            return msg
+
+                if daemon_spec.daemon_type == 'haproxy':
+                    haspec = cast(HA_RGWSpec, daemon_spec.spec)
+                    if haspec.haproxy_container_image:
+                        image = haspec.haproxy_container_image
+
+                if daemon_spec.daemon_type == 'keepalived':
+                    haspec = cast(HA_RGWSpec, daemon_spec.spec)
+                    if haspec.keepalived_container_image:
+                        image = haspec.keepalived_container_image
+
+                cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(daemon_spec.daemon_type)].generate_config(
+                    daemon_spec)
+
+                # TCP port to open in the host firewall
+                if len(ports) > 0:
+                    daemon_spec.extra_args.extend([
+                        '--tcp-ports', ' '.join(map(str, ports))
+                    ])
+
+                # osd deployments need an --osd-fsid arg
+                if daemon_spec.daemon_type == 'osd':
+                    if not osd_uuid_map:
+                        osd_uuid_map = self.mgr.get_osd_uuid_map()
+                    osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
+                    if not osd_uuid:
+                        raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
+                    daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])
+
+                if reconfig:
+                    daemon_spec.extra_args.append('--reconfig')
+                if self.mgr.allow_ptrace:
+                    daemon_spec.extra_args.append('--allow-ptrace')
+
+                if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
+                    self._registry_login(daemon_spec.host, self.mgr.registry_url,
+                                         self.mgr.registry_username, self.mgr.registry_password)
 
-            if daemon_spec.daemon_type == 'haproxy':
-                haspec = cast(HA_RGWSpec, daemon_spec.spec)
-                if haspec.haproxy_container_image:
-                    image = haspec.haproxy_container_image
-
-            if daemon_spec.daemon_type == 'keepalived':
-                haspec = cast(HA_RGWSpec, daemon_spec.spec)
-                if haspec.keepalived_container_image:
-                    image = haspec.keepalived_container_image
-
-            cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(daemon_spec.daemon_type)].generate_config(
-                daemon_spec)
-
-            # TCP port to open in the host firewall
-            if len(ports) > 0:
-                daemon_spec.extra_args.extend([
-                    '--tcp-ports', ' '.join(map(str, ports))
-                ])
-
-            # osd deployments needs an --osd-uuid arg
-            if daemon_spec.daemon_type == 'osd':
-                if not osd_uuid_map:
-                    osd_uuid_map = self.mgr.get_osd_uuid_map()
-                osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id)
-                if not osd_uuid:
-                    raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id)
-                daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid])
-
-            if reconfig:
-                daemon_spec.extra_args.append('--reconfig')
-            if self.mgr.allow_ptrace:
-                daemon_spec.extra_args.append('--allow-ptrace')
-
-            if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url:
-                self._registry_login(daemon_spec.host, self.mgr.registry_url,
-                                     self.mgr.registry_username, self.mgr.registry_password)
-
-            daemon_spec.extra_args.extend(['--config-json', '-'])
-
-            self.log.info('%s daemon %s on %s' % (
-                'Reconfiguring' if reconfig else 'Deploying',
-                daemon_spec.name(), daemon_spec.host))
-
-            out, err, code = self._run_cephadm(
-                daemon_spec.host, daemon_spec.name(), 'deploy',
-                [
-                    '--name', daemon_spec.name(),
-                ] + daemon_spec.extra_args,
-                stdin=json.dumps(cephadm_config),
-                image=image)
-            if not code and daemon_spec.host in self.mgr.cache.daemons:
-                # prime cached service state with what we (should have)
-                # just created
-                sd = orchestrator.DaemonDescription()
-                sd.daemon_type = daemon_spec.daemon_type
-                sd.daemon_id = daemon_spec.daemon_id
-                sd.hostname = daemon_spec.host
-                sd.status = 1
-                sd.status_desc = 'starting'
-                self.mgr.cache.add_daemon(daemon_spec.host, sd)
-                if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
-                    self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
-            self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
-            self.mgr.cache.update_daemon_config_deps(
-                daemon_spec.host, daemon_spec.name(), deps, start_time)
-            self.mgr.cache.save_host(daemon_spec.host)
-            msg = "{} {} on host '{}'".format(
-                'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
-            if not code:
-                self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
-            else:
-                what = 'reconfigure' if reconfig else 'deploy'
-                self.mgr.events.for_daemon(
-                    daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
-            return msg
+                daemon_spec.extra_args.extend(['--config-json', '-'])
+
+                self.log.info('%s daemon %s on %s' % (
+                    'Reconfiguring' if reconfig else 'Deploying',
+                    daemon_spec.name(), daemon_spec.host))
+
+                out, err, code = self._run_cephadm(
+                    daemon_spec.host, daemon_spec.name(), 'deploy',
+                    [
+                        '--name', daemon_spec.name(),
+                    ] + daemon_spec.extra_args,
+                    stdin=json.dumps(cephadm_config),
+                    image=image)
+                if not code and daemon_spec.host in self.mgr.cache.daemons:
+                    # prime cached service state with what we (should have)
+                    # just created
+                    sd = daemon_spec.to_daemon_description(1, 'starting')
+                    self.mgr.cache.add_daemon(daemon_spec.host, sd)
+                    if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']:
+                        self.mgr.requires_post_actions.add(daemon_spec.daemon_type)
+                self.mgr.cache.invalidate_host_daemons(daemon_spec.host)
+                self.mgr.cache.update_daemon_config_deps(
+                    daemon_spec.host, daemon_spec.name(), deps, start_time)
+                self.mgr.cache.save_host(daemon_spec.host)
+                msg = "{} {} on host '{}'".format(
+                    'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host)
+                if not code:
+                    self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg)
+                else:
+                    what = 'reconfigure' if reconfig else 'deploy'
+                    self.mgr.events.for_daemon(
+                        daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
+                return msg
+            except OrchestratorError:
+                if not reconfig:
+                    # we have to clean up the daemon. E.g. keyrings.
+                    service_type = daemon_type_to_service(daemon_spec.daemon_type)
+                    dd = daemon_spec.to_daemon_description(-1, 'failed')
+                    self.mgr.cephadm_services[service_type].post_remove(dd)
+                raise
 
     def _remove_daemon(self, name: str, host: str) -> str:
         """
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index da298b3e156379f2ca3d3bbbb6e71027281872d5..0ca15a1c78c76417bd0f72394d541399cdbc9627 100644
@@ -79,6 +79,15 @@ class CephadmDaemonSpec(Generic[ServiceSpecs]):
 
         return files
 
+    def to_daemon_description(self, status: int, status_desc: str) -> DaemonDescription:
+        return DaemonDescription(
+            daemon_type=self.daemon_type,
+            daemon_id=self.daemon_id,
+            hostname=self.host,
+            status=status,
+            status_desc=status_desc
+        )
+
 
 class CephadmService(metaclass=ABCMeta):
     """
@@ -355,7 +364,7 @@ class CephService(CephadmService):
         entity = self.get_auth_entity(daemon_id, host=host)
 
         logger.info(f'Remove keyring: {entity}')
-        ret, out, err = self.mgr.check_mon_command({
+        ret, out, err = self.mgr.mon_command({
             'prefix': 'auth rm',
             'entity': entity,
         })
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 4fc21ebc63cc3fdab79d2beda238b91416d8cf1f..79e344b138bab321741925e53082bee2f56281f7 100644
@@ -585,6 +585,22 @@ class TestCephadm(object):
             with with_daemon(cephadm_module, spec, meth, 'test'):
                 pass
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    def test_daemon_add_fail(self, _run_cephadm, cephadm_module):
+        _run_cephadm.return_value = '{}', '', 0
+        with with_host(cephadm_module, 'test'):
+            spec = ServiceSpec(
+                service_type='mgr',
+                placement=PlacementSpec(hosts=[HostPlacementSpec('test', '', 'x')], count=1)
+            )
+            _run_cephadm.side_effect = OrchestratorError('fail')
+            with pytest.raises(OrchestratorError):
+                wait(cephadm_module, cephadm_module.add_mgr(spec))
+            cephadm_module.assert_issued_mon_command({
+                'prefix': 'auth rm',
+                'entity': 'mgr.x',
+            })
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     @mock.patch("cephadm.module.CephadmOrchestrator.rados", mock.MagicMock())
     def test_nfs(self, cephadm_module):
diff --git a/src/pybind/mgr/tests/__init__.py b/src/pybind/mgr/tests/__init__.py
index cee20003f4f699488eba7be0c93f7a8d89980e3a..dd19aac4e8d59bc0ec3b3532d11b4de9a0d4aa37 100644
@@ -99,7 +99,9 @@ if 'UNITTEST' in os.environ:
             return self.mock_store_get('_ceph_get', data_name, mock.MagicMock())
 
         def _ceph_send_command(self, res, svc_type, svc_id, command, tag, inbuf):
+
             cmd = json.loads(command)
+            getattr(self, '_mon_commands_sent', []).append(cmd)
 
             # Mocking the config store is handy sometimes:
             def config_get():
@@ -139,6 +141,9 @@ if 'UNITTEST' in os.environ:
 
             res.complete(0, outb, '')
 
+        def assert_issued_mon_command(self, command):
+            assert command in self._mon_commands_sent, self._mon_commands_sent
+
         @property
         def _logger(self):
             return logging.getLogger(__name__)
@@ -148,6 +153,7 @@ if 'UNITTEST' in os.environ:
             pass
 
         def __init__(self, *args):
+            self._mon_commands_sent = []
             if not hasattr(self, '_store'):
                 self._store = {}