From: Sebastian Wagner Date: Wed, 3 Feb 2021 15:24:25 +0000 (+0100) Subject: mgr/cephadm: cleanup daemon keyring on failed daemon deploy X-Git-Tag: v16.2.0~178^2~57 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=18ae6e209f7d19f98a39775b7a640ae5feb5e86b;p=ceph.git mgr/cephadm: cleanup daemon keyring on failed daemon deploy Fixes: https://tracker.ceph.com/issues/48164 Signed-off-by: Sebastian Wagner (cherry picked from commit f200cb26cc0d796c3af583e1d2bf72ec65213cd6) --- diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 3dfec9d698f9..13ae1fec1f1b 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -584,7 +584,8 @@ class CephadmServe: daemon_ids = [d.daemon_id for d in remove_daemon_hosts] assert None not in daemon_ids # setting force flag retains previous behavior, should revisit later. - r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids), force=True) + r = self.mgr.cephadm_services[service_type].ok_to_stop( + cast(List[str], daemon_ids), force=True) return not r.retval while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts): @@ -735,111 +736,114 @@ class CephadmServe: hostname=daemon_spec.host, ).service_id(), overwrite=True): - image = '' - start_time = datetime_now() - ports: List[int] = daemon_spec.ports if daemon_spec.ports else [] - - if daemon_spec.daemon_type == 'container': - spec: Optional[CustomContainerSpec] = daemon_spec.spec - if spec is None: - # Exit here immediately because the required service - # spec to create a daemon is not provided. This is only - # provided when a service is applied via 'orch apply' - # command. - msg = "Failed to {} daemon {} on {}: Required " \ - "service specification not provided".format( - 'reconfigure' if reconfig else 'deploy', - daemon_spec.name(), daemon_spec.host) - self.log.info(msg) - return msg - image = spec.image - if spec.ports: - ports.extend(spec.ports) - - if daemon_spec.daemon_type == 'cephadm-exporter': - if not reconfig: - assert daemon_spec.host - deploy_ok = self._deploy_cephadm_binary(daemon_spec.host) - if not deploy_ok: - msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}" - self.log.warning(msg) + try: + image = '' + start_time = datetime_now() + ports: List[int] = daemon_spec.ports if daemon_spec.ports else [] + + if daemon_spec.daemon_type == 'container': + spec: Optional[CustomContainerSpec] = daemon_spec.spec + if spec is None: + # Exit here immediately because the required service + # spec to create a daemon is not provided. This is only + # provided when a service is applied via 'orch apply' + # command. 
+ msg = "Failed to {} daemon {} on {}: Required " \ + "service specification not provided".format( + 'reconfigure' if reconfig else 'deploy', + daemon_spec.name(), daemon_spec.host) + self.log.info(msg) return msg + image = spec.image + if spec.ports: + ports.extend(spec.ports) + + if daemon_spec.daemon_type == 'cephadm-exporter': + if not reconfig: + assert daemon_spec.host + deploy_ok = self._deploy_cephadm_binary(daemon_spec.host) + if not deploy_ok: + msg = f"Unable to deploy the cephadm binary to {daemon_spec.host}" + self.log.warning(msg) + return msg + + if daemon_spec.daemon_type == 'haproxy': + haspec = cast(HA_RGWSpec, daemon_spec.spec) + if haspec.haproxy_container_image: + image = haspec.haproxy_container_image + + if daemon_spec.daemon_type == 'keepalived': + haspec = cast(HA_RGWSpec, daemon_spec.spec) + if haspec.keepalived_container_image: + image = haspec.keepalived_container_image + + cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(daemon_spec.daemon_type)].generate_config( + daemon_spec) + + # TCP port to open in the host firewall + if len(ports) > 0: + daemon_spec.extra_args.extend([ + '--tcp-ports', ' '.join(map(str, ports)) + ]) + + # osd deployments needs an --osd-uuid arg + if daemon_spec.daemon_type == 'osd': + if not osd_uuid_map: + osd_uuid_map = self.mgr.get_osd_uuid_map() + osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id) + if not osd_uuid: + raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id) + daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid]) + + if reconfig: + daemon_spec.extra_args.append('--reconfig') + if self.mgr.allow_ptrace: + daemon_spec.extra_args.append('--allow-ptrace') + + if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url: + self._registry_login(daemon_spec.host, self.mgr.registry_url, + self.mgr.registry_username, self.mgr.registry_password) - if daemon_spec.daemon_type == 'haproxy': - haspec = cast(HA_RGWSpec, daemon_spec.spec) - if haspec.haproxy_container_image: - image = haspec.haproxy_container_image - - if daemon_spec.daemon_type == 'keepalived': - haspec = cast(HA_RGWSpec, daemon_spec.spec) - if haspec.keepalived_container_image: - image = haspec.keepalived_container_image - - cephadm_config, deps = self.mgr.cephadm_services[daemon_type_to_service(daemon_spec.daemon_type)].generate_config( - daemon_spec) - - # TCP port to open in the host firewall - if len(ports) > 0: - daemon_spec.extra_args.extend([ - '--tcp-ports', ' '.join(map(str, ports)) - ]) - - # osd deployments needs an --osd-uuid arg - if daemon_spec.daemon_type == 'osd': - if not osd_uuid_map: - osd_uuid_map = self.mgr.get_osd_uuid_map() - osd_uuid = osd_uuid_map.get(daemon_spec.daemon_id) - if not osd_uuid: - raise OrchestratorError('osd.%s not in osdmap' % daemon_spec.daemon_id) - daemon_spec.extra_args.extend(['--osd-fsid', osd_uuid]) - - if reconfig: - daemon_spec.extra_args.append('--reconfig') - if self.mgr.allow_ptrace: - daemon_spec.extra_args.append('--allow-ptrace') - - if self.mgr.cache.host_needs_registry_login(daemon_spec.host) and self.mgr.registry_url: - self._registry_login(daemon_spec.host, self.mgr.registry_url, - self.mgr.registry_username, self.mgr.registry_password) - - daemon_spec.extra_args.extend(['--config-json', '-']) - - self.log.info('%s daemon %s on %s' % ( - 'Reconfiguring' if reconfig else 'Deploying', - daemon_spec.name(), daemon_spec.host)) - - out, err, code = self._run_cephadm( - daemon_spec.host, daemon_spec.name(), 'deploy', - [ - '--name', 
daemon_spec.name(), - ] + daemon_spec.extra_args, - stdin=json.dumps(cephadm_config), - image=image) - if not code and daemon_spec.host in self.mgr.cache.daemons: - # prime cached service state with what we (should have) - # just created - sd = orchestrator.DaemonDescription() - sd.daemon_type = daemon_spec.daemon_type - sd.daemon_id = daemon_spec.daemon_id - sd.hostname = daemon_spec.host - sd.status = 1 - sd.status_desc = 'starting' - self.mgr.cache.add_daemon(daemon_spec.host, sd) - if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']: - self.mgr.requires_post_actions.add(daemon_spec.daemon_type) - self.mgr.cache.invalidate_host_daemons(daemon_spec.host) - self.mgr.cache.update_daemon_config_deps( - daemon_spec.host, daemon_spec.name(), deps, start_time) - self.mgr.cache.save_host(daemon_spec.host) - msg = "{} {} on host '{}'".format( - 'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host) - if not code: - self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg) - else: - what = 'reconfigure' if reconfig else 'deploy' - self.mgr.events.for_daemon( - daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}') - return msg + daemon_spec.extra_args.extend(['--config-json', '-']) + + self.log.info('%s daemon %s on %s' % ( + 'Reconfiguring' if reconfig else 'Deploying', + daemon_spec.name(), daemon_spec.host)) + + out, err, code = self._run_cephadm( + daemon_spec.host, daemon_spec.name(), 'deploy', + [ + '--name', daemon_spec.name(), + ] + daemon_spec.extra_args, + stdin=json.dumps(cephadm_config), + image=image) + if not code and daemon_spec.host in self.mgr.cache.daemons: + # prime cached service state with what we (should have) + # just created + sd = daemon_spec.to_daemon_description(1, 'starting') + self.mgr.cache.add_daemon(daemon_spec.host, sd) + if daemon_spec.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager']: + self.mgr.requires_post_actions.add(daemon_spec.daemon_type) + self.mgr.cache.invalidate_host_daemons(daemon_spec.host) + self.mgr.cache.update_daemon_config_deps( + daemon_spec.host, daemon_spec.name(), deps, start_time) + self.mgr.cache.save_host(daemon_spec.host) + msg = "{} {} on host '{}'".format( + 'Reconfigured' if reconfig else 'Deployed', daemon_spec.name(), daemon_spec.host) + if not code: + self.mgr.events.for_daemon(daemon_spec.name(), OrchestratorEvent.INFO, msg) + else: + what = 'reconfigure' if reconfig else 'deploy' + self.mgr.events.for_daemon( + daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}') + return msg + except OrchestratorError: + if not reconfig: + # we have to clean up the daemon. E.g. keyrings. 
+ servict_type = daemon_type_to_service(daemon_spec.daemon_type) + dd = daemon_spec.to_daemon_description(-1, 'failed') + self.mgr.cephadm_services[servict_type].post_remove(dd) + raise def _remove_daemon(self, name: str, host: str) -> str: """ diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index da298b3e1563..0ca15a1c78c7 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -79,6 +79,15 @@ class CephadmDaemonSpec(Generic[ServiceSpecs]): return files + def to_daemon_description(self, status: int, status_desc: str) -> DaemonDescription: + return DaemonDescription( + daemon_type=self.daemon_type, + daemon_id=self.daemon_id, + hostname=self.host, + status=status, + status_desc=status_desc + ) + class CephadmService(metaclass=ABCMeta): """ @@ -355,7 +364,7 @@ class CephService(CephadmService): entity = self.get_auth_entity(daemon_id, host=host) logger.info(f'Remove keyring: {entity}') - ret, out, err = self.mgr.check_mon_command({ + ret, out, err = self.mgr.mon_command({ 'prefix': 'auth rm', 'entity': entity, }) diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 4fc21ebc63cc..79e344b138ba 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -585,6 +585,22 @@ class TestCephadm(object): with with_daemon(cephadm_module, spec, meth, 'test'): pass + @mock.patch("cephadm.serve.CephadmServe._run_cephadm") + def test_daemon_add_fail(self, _run_cephadm, cephadm_module): + _run_cephadm.return_value = '{}', '', 0 + with with_host(cephadm_module, 'test'): + spec = ServiceSpec( + service_type='mgr', + placement=PlacementSpec(hosts=[HostPlacementSpec('test', '', 'x')], count=1) + ) + _run_cephadm.side_effect = OrchestratorError('fail') + with pytest.raises(OrchestratorError): + wait(cephadm_module, cephadm_module.add_mgr(spec)) + cephadm_module.assert_issued_mon_command({ + 'prefix': 'auth rm', + 'entity': 'mgr.x', + }) + @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @mock.patch("cephadm.module.CephadmOrchestrator.rados", mock.MagicMock()) def test_nfs(self, cephadm_module): diff --git a/src/pybind/mgr/tests/__init__.py b/src/pybind/mgr/tests/__init__.py index cee20003f4f6..dd19aac4e8d5 100644 --- a/src/pybind/mgr/tests/__init__.py +++ b/src/pybind/mgr/tests/__init__.py @@ -99,7 +99,9 @@ if 'UNITTEST' in os.environ: return self.mock_store_get('_ceph_get', data_name, mock.MagicMock()) def _ceph_send_command(self, res, svc_type, svc_id, command, tag, inbuf): + cmd = json.loads(command) + getattr(self, '_mon_commands_sent', []).append(cmd) # Mocking the config store is handy sometimes: def config_get(): @@ -139,6 +141,9 @@ if 'UNITTEST' in os.environ: res.complete(0, outb, '') + def assert_issued_mon_command(self, command): + assert command in self._mon_commands_sent, self._mon_commands_sent + @property def _logger(self): return logging.getLogger(__name__) @@ -148,6 +153,7 @@ if 'UNITTEST' in os.environ: pass def __init__(self, *args): + self._mon_commands_sent = [] if not hasattr(self, '_store'): self._store = {}
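
The heart of the change is the new try/except around the deploy path in CephadmServe._create_daemon(): the keyring for the new daemon already exists by the time the remote cephadm call is made, so if the deploy raises OrchestratorError and we are not merely reconfiguring, the daemon's service gets a chance to clean up after itself. post_remove() is called with a synthetic DaemonDescription built by the new CephadmDaemonSpec.to_daemon_description() helper (the same helper the success path now uses to prime the daemon cache), which for Ceph services removes the keyring again. The shape of the pattern, as a minimal standalone sketch with toy stand-ins (FakeMonStore, deploy_on_host) rather than the real cephadm classes:

    # Sketch only: OrchestratorError/FakeMonStore/create_daemon here are toy
    # stand-ins illustrating "clean up the keyring if the deploy fails",
    # not the actual mgr/cephadm code.
    class OrchestratorError(Exception):
        pass

    class FakeMonStore:
        """Toy replacement for the mon auth database."""
        def __init__(self):
            self.keyrings = set()

        def auth_get_or_create(self, entity):
            self.keyrings.add(entity)

        def auth_rm(self, entity):
            # idempotent, like issuing 'auth rm' via mon_command()
            self.keyrings.discard(entity)

    def create_daemon(mon, entity, deploy_on_host, reconfig=False):
        mon.auth_get_or_create(entity)      # keyring exists before the deploy
        try:
            deploy_on_host()                # may raise OrchestratorError
        except OrchestratorError:
            if not reconfig:
                # mirror post_remove(): a failed deploy must not leave a
                # stale auth entity behind
                mon.auth_rm(entity)
            raise

    if __name__ == '__main__':
        mon = FakeMonStore()

        def failing_deploy():
            raise OrchestratorError('cephadm deploy failed')

        try:
            create_daemon(mon, 'mgr.x', failing_deploy)
        except OrchestratorError:
            pass
        assert 'mgr.x' not in mon.keyrings  # keyring was cleaned up

Related to this, remove_keyring() now issues 'auth rm' through mon_command() instead of check_mon_command(), presumably so the cleanup path does not raise a second time if the auth entity is already gone.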
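
On the test side, the unit-test Mgr mock now keeps a record of every mon command it is asked to send: _ceph_send_command() parses the JSON command into _mon_commands_sent, and assert_issued_mon_command() asserts membership. test_daemon_add_fail() uses this to verify that a failed mgr deploy issues {'prefix': 'auth rm', 'entity': 'mgr.x'}. Roughly, the recording hook behaves like the following self-contained sketch (RecordingMgr is a hypothetical stand-in, not the real test fixture):

    import json

    class RecordingMgr:
        def __init__(self):
            # populated by _ceph_send_command(), inspected by the assertion helper
            self._mon_commands_sent = []

        def _ceph_send_command(self, res, svc_type, svc_id, command, tag, inbuf):
            # the unused parameters mirror the real hook's signature
            self._mon_commands_sent.append(json.loads(command))

        def assert_issued_mon_command(self, command):
            assert command in self._mon_commands_sent, self._mon_commands_sent

    mgr = RecordingMgr()
    mgr._ceph_send_command(None, 'mon', '',
                           json.dumps({'prefix': 'auth rm', 'entity': 'mgr.x'}),
                           None, None)
    mgr.assert_issued_mon_command({'prefix': 'auth rm', 'entity': 'mgr.x'})

One detail worth noting in the real hook: it appends via getattr(self, '_mon_commands_sent', []), so any command sent before __init__() has created the list lands in a throwaway list and is silently dropped rather than raising.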