From: Redouane Kachach Date: Fri, 14 Nov 2025 12:06:59 +0000 (+0100) Subject: mgr/cephadm: don't remove TLS certs if svc still has daemons on host X-Git-Tag: testing/wip-pdonnell-testing-20260323.122957-tentacle~362 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a9ee38bff2b79ebf12e9894690705036936aad46;p=ceph-ci.git mgr/cephadm: don't remove TLS certs if svc still has daemons on host This change fixes an issue in cephadm where cephadm-signed (and inline) TLS certificates could be removed while a service was still running on the same host. During a rolling transition from HTTP to HTTPS (e.g. RGW moving from port 80 to 443 with ssl: true), the previous post_remove() logic deletes the service’s cephadm-signed cert/key as soon as any daemon is removed, even if a new HTTPS daemon for the same service is being deployed on that host. In practice this leads to the certificate being created for the new daemon and then immediately deleted from the certmgr store. The new behavior makes post_remove() more conservative: before removing a cephadm-signed or inline certificate, it checks whether there are any remaining daemons for that service on the same host. If there are, the cert/key is left in place because it may still be in use (for example during an HTTP->HTTPS rollout). Certificates are only cleaned up once the last daemon for that service disappears from the host (and, when the service no longer uses SSL). This preserves correct TLS behavior during service transitions while still ensuring certificates are eventually garbage-collected when no longer needed. Fixes: https://tracker.ceph.com/issues/73853 Resolves: rhbz#2408795 Signed-off-by: Redouane Kachach --- diff --git a/src/pybind/mgr/cephadm/cert_mgr.py b/src/pybind/mgr/cephadm/cert_mgr.py index 8c6c671ed1e..b636f3ddce9 100644 --- a/src/pybind/mgr/cephadm/cert_mgr.py +++ b/src/pybind/mgr/cephadm/cert_mgr.py @@ -324,8 +324,13 @@ class CertMgr: return cert_obj is not None def is_cert_editable(self, cert_name: str, service_name: Optional[str] = None, host: Optional[str] = None) -> bool: - cert_obj = cast(Cert, self.cert_store.get_tlsobject(cert_name, service_name, host)) - return cert_obj.editable if cert_obj else True + # By definition a certificate which doesn't not exist it's considered + # editable so user can populate its content by using a custom value. + if self.cert_store.tlsobject_exists(cert_name): + cert_obj = cast(Cert, self.cert_store.get_tlsobject(cert_name, service_name, host)) + return cert_obj.editable if cert_obj else True + else: + return True def get_cert(self, cert_name: str, service_name: Optional[str] = None, host: Optional[str] = None) -> Optional[str]: cert_obj = cast(Cert, self.cert_store.get_tlsobject(cert_name, service_name, host)) @@ -365,6 +370,30 @@ class CertMgr: self.rm_cert(self.self_signed_cert(service_name, label), service_name, host) self.rm_key(self.self_signed_key(service_name, label), service_name, host) + def rm_cert_if_present(self, cert_name: str, service_name: Optional[str] = None, host: Optional[str] = None) -> bool: + try: + return self.rm_cert(cert_name, service_name, host) + except TLSObjectException: + logger.debug( + "TLS cert %s for service=%s host=%s not found; nothing to remove", + cert_name, service_name, host, + ) + return False + + def rm_key_if_present(self, key_name: str, service_name: Optional[str] = None, host: Optional[str] = None) -> bool: + try: + return self.rm_key(key_name, service_name, host) + except TLSObjectException: + logger.debug( + "TLS key %s for service=%s host=%s not found; nothing to remove", + key_name, service_name, host, + ) + return False + + def rm_self_signed_cert_key_pair_if_present(self, service_name: str, host: str, label: Optional[str] = None) -> None: + self.rm_cert_if_present(self.self_signed_cert(service_name, label), service_name, host) + self.rm_key_if_present(self.self_signed_key(service_name, label), service_name, host) + def cert_ls(self, filter_by: str = '', include_details: bool = False, include_cephadm_signed: bool = False) -> Dict: diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index c681a55595d..f372d89cf79 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -799,6 +799,35 @@ class CephadmService(metaclass=ABCMeta): """ Called after the daemon is removed. """ + + def _cleanup_tls_creds_for_host(svc_name: str, host: str, cert_source: Optional[str]) -> None: + + if cert_source == CertificateSource.CEPHADM_SIGNED.value: + logger.info(f"Removing cephadm-signed certificate/key for service: {svc_name}, host: {host}") + self.mgr.cert_mgr.rm_self_signed_cert_key_pair(svc_name, host) + elif cert_source == CertificateSource.INLINE.value: + logger.info(f"Removing inline-saved certificate/key for service: {svc_name}, host: {host}") + self.mgr.cert_mgr.rm_cert(self.cert_name, svc_name, host) + self.mgr.cert_mgr.rm_key(self.key_name, svc_name, host) + if self.ca_cert_name: + self.mgr.cert_mgr.rm_cert(self.ca_cert_name, svc_name, host) + elif cert_source == CertificateSource.REFERENCE.value: + # It's a reference cert/key to the certmgr so we must keep them as user may want to use them later + logger.info(f"Keeping referenced certificate/key for service: {svc_name}, host: {host}") + elif cert_source is None: + # This could happen when TLS is being disabled in spec by the user, so we lose the value + # of the certificate source but we still have to do our best to cleanup the TLS credentials. + logger.info(f"Removing certificate/key for service: {svc_name}, host: {host}") + self.mgr.cert_mgr.rm_self_signed_cert_key_pair_if_present(svc_name, host) + # try to remove inline certs (if any) + if not self.mgr.cert_mgr.is_cert_editable(self.cert_name, svc_name, host): + self.mgr.cert_mgr.rm_cert_if_present(self.cert_name, svc_name, host) + self.mgr.cert_mgr.rm_key_if_present(self.key_name, svc_name, host) + if self.ca_cert_name: + self.mgr.cert_mgr.rm_cert_if_present(self.ca_cert_name, svc_name, host) + else: + logger.error(f"Unknown cert-source {cert_source}. Cannot remove cert/key for: {svc_name}, host: {host}") + assert daemon.daemon_type is not None assert daemon.hostname assert self.TYPE == daemon_type_to_service(daemon.daemon_type) @@ -810,24 +839,27 @@ class CephadmService(metaclass=ABCMeta): if svc_name not in self.mgr.spec_store: return + host = daemon.hostname spec = self.mgr.spec_store[svc_name].spec if not spec.ssl: + # Service no longer uses TLS; safe to clean up TLS credentials for this host + _cleanup_tls_creds_for_host(svc_name, host, spec.certificate_source) return - host = daemon.hostname - cert_source = spec.certificate_source - if cert_source == CertificateSource.CEPHADM_SIGNED.value: - logger.info(f"Removing cephadm-signed certificate/key for service: {svc_name}, host: {host}") - self.mgr.cert_mgr.rm_self_signed_cert_key_pair(svc_name, host) - elif cert_source == CertificateSource.INLINE.value: - logger.info(f"Removing inline-saved certificate/key for service: {svc_name}, host: {host}") - self.mgr.cert_mgr.rm_cert(self.cert_name, svc_name, host) - self.mgr.cert_mgr.rm_key(self.key_name, svc_name, host) - if self.ca_cert_name: - self.mgr.cert_mgr.rm_cert(self.ca_cert_name, svc_name, host) + remaining = [ + d for d in self.mgr.cache.get_daemons_by_service(svc_name) + if d.hostname == host + ] + if remaining: + # The service still has daemons running on this host (e.g. during an HTTP -> HTTPS + # transition the daemons running on :80 are removed but new daemons running on :443 + # area created), so the TLS cert/key may still be in use. Do not remove it yet. + logger.info( + "Not removing TLS cert/key for %s on %s: still %d daemons present", + svc_name, host, len(remaining) + ) else: - # It's a reference cert/key to the certmgr so we must keep them as user may want to use them later - logger.info(f"Keeping referenced certificate/key for service: {svc_name}, host: {host}") + _cleanup_tls_creds_for_host(svc_name, host, spec.certificate_source) def purge(self, service_name: str) -> None: """Called to carry out any purge tasks following service removal""" @@ -911,7 +943,7 @@ class CephService(CephadmService): entity = self.get_auth_entity(daemon_id, host=host) - logger.info(f'Removing key for {entity}') + logger.info(f'Removing keyring for {entity}') ret, out, err = self.mgr.mon_command({ 'prefix': 'auth rm', 'entity': entity, diff --git a/src/pybind/mgr/cephadm/tlsobject_store.py b/src/pybind/mgr/cephadm/tlsobject_store.py index 1a166c786fb..d621be96ad9 100644 --- a/src/pybind/mgr/cephadm/tlsobject_store.py +++ b/src/pybind/mgr/cephadm/tlsobject_store.py @@ -206,6 +206,9 @@ class TLSObjectStore(): raise TLSObjectException(f'Attempted to remove {self.tlsobject_class.__name__.lower()} for unknown obj_name {obj_name}') return False + def tlsobject_exists(self, obj_name: str) -> bool: + return obj_name in self.objects_by_name + def _validate_tlsobject_name(self, obj_name: str, service_name: Optional[str] = None, host: Optional[str] = None) -> None: cred_type = self.tlsobject_class.__name__.lower() if obj_name not in self.objects_by_name: