From 52c640cee914d2dc6f59e24c9f4c5749542847e0 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Wed, 11 Jun 2025 12:30:46 +0200 Subject: [PATCH] mgr/cephadm: adapting agent service to use the new cert mgmt Signed-off-by: Redouane Kachach --- src/pybind/mgr/cephadm/agent.py | 56 ++++++++++++------- .../mgr/cephadm/services/service_discovery.py | 2 +- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index 11b33427ec7..1c103789201 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -17,12 +17,14 @@ from orchestrator import DaemonDescriptionStatus from orchestrator._interface import daemon_type_to_service from ceph.utils import datetime_now, http_req from ceph.deployment.inventory import Devices -from ceph.deployment.service_spec import ServiceSpec, PlacementSpec +from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, CertificateSource from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from mgr_util import test_port_allocation, PortAlreadyInUse from mgr_util import verify_tls_files import tempfile from cephadm.services.service_registry import service_registry +from cephadm.services.cephadmservice import CephadmAgent +from cephadm.tlsobject_types import CertKeyPair from urllib.error import HTTPError, URLError from typing import Any, Dict, List, Set, TYPE_CHECKING, Optional, MutableMapping, IO @@ -43,6 +45,9 @@ logging.getLogger('cherrypy.error').addFilter(cherrypy_filter) cherrypy.log.access_log.propagate = False +CEPHADM_AGENT_CERT_DURATION = (365 * 5) + + class AgentEndpoint: def __init__(self, mgr: "CephadmOrchestrator") -> None: @@ -59,20 +64,27 @@ class AgentEndpoint: cherrypy.tree.mount(self.node_proxy_endpoint, '/node-proxy', config=conf) def configure_tls(self, server: Server) -> None: - addr = self.mgr.get_mgr_ip() - host = self.mgr.get_hostname() - cert, key = self.mgr.cert_mgr.generate_cert(host, addr) + 
self.mgr.cert_mgr.register_self_signed_cert_key_pair(CephadmAgent.TYPE) + tls_pair = self._get_agent_certificates() self.cert_file = tempfile.NamedTemporaryFile() - self.cert_file.write(cert.encode('utf-8')) + self.cert_file.write(tls_pair.cert.encode('utf-8')) self.cert_file.flush() # cert_tmp must not be gc'ed self.key_file = tempfile.NamedTemporaryFile() - self.key_file.write(key.encode('utf-8')) + self.key_file.write(tls_pair.key.encode('utf-8')) self.key_file.flush() # pkey_tmp must not be gc'ed verify_tls_files(self.cert_file.name, self.key_file.name) server.ssl_certificate, server.ssl_private_key = self.cert_file.name, self.key_file.name + def _get_agent_certificates(self) -> CertKeyPair: + host = self.mgr.get_hostname() + tls_pair = self.mgr.cert_mgr.get_self_signed_cert_key_pair(CephadmAgent.TYPE, host) + if not tls_pair: + tls_pair = self.mgr.cert_mgr.generate_cert(host, self.mgr.get_mgr_ip(), duration_in_days=CEPHADM_AGENT_CERT_DURATION) + self.mgr.cert_mgr.save_self_signed_cert_key_pair(CephadmAgent.TYPE, tls_pair, host=host) + return tls_pair + def find_free_port(self) -> None: max_port = self.server_port + 150 while self.server_port <= max_port: @@ -782,15 +794,15 @@ class AgentMessageThread(threading.Thread): root_cert_tmp.flush() root_cert_fname = root_cert_tmp.name - cert, key = self.mgr.cert_mgr.generate_cert(self.mgr.get_hostname(), self.mgr.get_mgr_ip()) + tls_pair = self.mgr.cert_mgr.generate_cert(self.mgr.get_hostname(), self.mgr.get_mgr_ip(), duration_in_days=CEPHADM_AGENT_CERT_DURATION) cert_tmp = tempfile.NamedTemporaryFile() - cert_tmp.write(cert.encode('utf-8')) + cert_tmp.write(tls_pair.cert.encode('utf-8')) cert_tmp.flush() cert_fname = cert_tmp.name key_tmp = tempfile.NamedTemporaryFile() - key_tmp.write(key.encode('utf-8')) + key_tmp.write(tls_pair.key.encode('utf-8')) key_tmp.flush() key_fname = key_tmp.name @@ -877,7 +889,7 @@ class CephadmAgentHelpers: if self.mgr.cache.is_host_draining(host): return False # if we haven't deployed 
an agent on the host yet, don't say an agent is down - if not self.mgr.cache.get_daemons_by_type('agent', host=host): + if not self.mgr.cache.get_daemons_by_type(CephadmAgent.TYPE, host=host): return False # if we don't have a timestamp, it's likely because of a mgr fail over. # just set the timestamp to now. However, if host was offline before, we @@ -902,7 +914,7 @@ class CephadmAgentHelpers: detail.append((f'Cephadm agent on host {agent} has not reported in ' f'{down_mult * self.mgr.agent_refresh_rate} seconds. Agent is assumed ' 'down and host may be offline.')) - for dd in [d for d in self.mgr.cache.get_daemons_by_type('agent') if d.hostname in down_agent_hosts]: + for dd in [d for d in self.mgr.cache.get_daemons_by_type(CephadmAgent.TYPE) if d.hostname in down_agent_hosts]: dd.status = DaemonDescriptionStatus.error self.mgr.set_health_warning( 'CEPHADM_AGENT_DOWN', @@ -918,8 +930,10 @@ class CephadmAgentHelpers: # daemons, OR we can put this in its own function then mock the function def _apply_agent(self) -> None: spec = ServiceSpec( - service_type='agent', - placement=PlacementSpec(host_pattern='*') + service_type=CephadmAgent.TYPE, + placement=PlacementSpec(host_pattern='*'), + ssl=True, + certificate_source=CertificateSource.CEPHADM_SIGNED.value, ) self.mgr.spec_store.save(spec) @@ -930,17 +944,17 @@ class CephadmAgentHelpers: # when we turned the config option off, we need to redeploy them # we can tell they're in that state if we don't have a keyring for # them in the host cache - for agent in self.mgr.cache.get_daemons_by_service('agent'): + for agent in self.mgr.cache.get_daemons_by_service(CephadmAgent.TYPE): if agent.hostname not in self.mgr.agent_cache.agent_keys: self.mgr._schedule_daemon_action(agent.name(), 'redeploy') - if 'agent' not in self.mgr.spec_store: + if CephadmAgent.TYPE not in self.mgr.spec_store: self.mgr.agent_helpers._apply_agent() need_apply = True else: - if 'agent' in self.mgr.spec_store: - self.mgr.spec_store.rm('agent') + if 
CephadmAgent.TYPE in self.mgr.spec_store: + self.mgr.spec_store.rm(CephadmAgent.TYPE) need_apply = True - if not self.mgr.cache.get_daemons_by_service('agent'): + if not self.mgr.cache.get_daemons_by_service(CephadmAgent.TYPE): self.mgr.agent_cache.agent_counter = {} self.mgr.agent_cache.agent_timestamp = {} self.mgr.agent_cache.agent_keys = {} @@ -959,7 +973,7 @@ class CephadmAgentHelpers: if self.mgr.agent_helpers._agent_down(host): down = True try: - agent = self.mgr.cache.get_daemons_by_type('agent', host=host)[0] + agent = self.mgr.cache.get_daemons_by_type(CephadmAgent.TYPE, host=host)[0] assert agent.daemon_id is not None assert agent.hostname is not None except Exception as e: @@ -967,8 +981,8 @@ class CephadmAgentHelpers: f'Could not retrieve agent on host {host} from daemon cache: {e}') return down try: - spec = self.mgr.spec_store.active_specs.get('agent', None) - deps = self.mgr._calc_daemon_deps(spec, 'agent', agent.daemon_id) + spec = self.mgr.spec_store.active_specs.get(CephadmAgent.TYPE, None) + deps = self.mgr._calc_daemon_deps(spec, CephadmAgent.TYPE, agent.daemon_id) last_deps, last_config = self.mgr.agent_cache.get_agent_last_config_deps(host) if not last_config or last_deps != deps: # if root cert is the dep that changed, we must use ssh to reconfig diff --git a/src/pybind/mgr/cephadm/services/service_discovery.py b/src/pybind/mgr/cephadm/services/service_discovery.py index c2381948dc2..45fddcac6c6 100644 --- a/src/pybind/mgr/cephadm/services/service_discovery.py +++ b/src/pybind/mgr/cephadm/services/service_discovery.py @@ -93,7 +93,7 @@ class ServiceDiscovery: def configure_tls(self, server: Server) -> None: addr = self.mgr.get_mgr_ip() host = self.mgr.get_hostname() - cert, key = self.mgr.cert_mgr.generate_cert(host, addr) + cert, key = self.mgr.cert_mgr.generate_cert(host, addr, duration_in_days = (365 * 5)) self.cert_file = tempfile.NamedTemporaryFile() self.cert_file.write(cert.encode('utf-8')) self.cert_file.flush() # cert_tmp must not 
be gc'ed -- 2.39.5