From 1280f0114dcd9f23c2b5149826b5b9d30555bc15 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Tue, 14 Jan 2025 10:34:27 +0100 Subject: [PATCH] mgr/cephadm: re-factoring the dependencies calculation code currently, the dependency logic is duplicated between the different Service classes and the module.py::_calc_daemon_deps function, which can lead to issues such as BUGSs, difficulty in maintenance, and other problems associated with duplicated code. In this change, we are consolidating all the dependency logic into the Service subclasses to eliminate this duplication. This way, we also force anybody creating a new Service to think about its potential dependencies. Fixes: https://tracker.ceph.com/issues/69021 Signed-off-by: Redouane Kachach --- src/pybind/mgr/cephadm/inventory.py | 9 +- src/pybind/mgr/cephadm/module.py | 123 +----------------- src/pybind/mgr/cephadm/serve.py | 4 +- .../mgr/cephadm/services/cephadmservice.py | 36 ++++- src/pybind/mgr/cephadm/services/ingress.py | 44 ++++++- src/pybind/mgr/cephadm/services/iscsi.py | 41 ++++-- src/pybind/mgr/cephadm/services/jaeger.py | 24 +++- .../mgr/cephadm/services/mgmt_gateway.py | 10 +- src/pybind/mgr/cephadm/services/monitoring.py | 121 ++++++++++------- src/pybind/mgr/cephadm/services/node_proxy.py | 21 ++- src/pybind/mgr/cephadm/tests/test_services.py | 2 +- 11 files changed, 225 insertions(+), 210 deletions(-) diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 550604fc55b..d67c5a14abb 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -1338,11 +1338,16 @@ class HostCache(): def get_daemons_by_type(self, service_type: str, host: str = '') -> List[orchestrator.DaemonDescription]: assert service_type not in ['keepalived', 'haproxy'] - daemons = self.daemons[host].values() if host else self._get_daemons() - return [d for d in daemons if d.daemon_type in service_to_daemon_types(service_type)] + def get_daemons_by_types(self, daemon_types: List[str]) -> List[str]: + daemon_names = [] + for daemon_type in daemon_types: + for dd in self.get_daemons_by_type(daemon_type): + daemon_names.append(dd.name()) + return daemon_names + def get_daemon_types(self, hostname: str) -> Set[str]: """Provide a list of the types of daemons on the host""" return cast(Set[str], {d.daemon_type for d in self.daemons[hostname].values()}) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 6690153d435..afa715bb912 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -36,7 +36,7 @@ from ceph.deployment.drive_group import DriveGroupSpec from ceph.deployment.service_spec import \ ServiceSpec, PlacementSpec, \ HostPlacementSpec, IngressSpec, \ - TunedProfileSpec, IscsiServiceSpec, \ + TunedProfileSpec, \ MgmtGatewaySpec from ceph.utils import str_to_datetime, datetime_to_str, datetime_now from cephadm.serve import CephadmServe @@ -2931,124 +2931,9 @@ Then run the following: spec: Optional[ServiceSpec], daemon_type: str, daemon_id: str) -> List[str]: - - def get_daemon_names(daemons: List[str]) -> List[str]: - daemon_names = [] - for daemon_type in daemons: - for dd in self.cache.get_daemons_by_type(daemon_type): - daemon_names.append(dd.name()) - return daemon_names - - prom_cred_hash = None - alertmgr_cred_hash = None - security_enabled, mgmt_gw_enabled, _ = self._get_security_config() - if security_enabled: - alertmanager_user, alertmanager_password = self._get_alertmanager_credentials() - prometheus_user, prometheus_password = self._get_prometheus_credentials() - if prometheus_user and prometheus_password: - prom_cred_hash = f'{utils.md5_hash(prometheus_user + prometheus_password)}' - if alertmanager_user and alertmanager_password: - alertmgr_cred_hash = f'{utils.md5_hash(alertmanager_user + alertmanager_password)}' - - deps = [] - if daemon_type == 'haproxy': - # because cephadm creates new daemon instances whenever - # port or ip changes, identifying daemons by name is - # sufficient to detect changes. - if not spec: - return [] - ingress_spec = cast(IngressSpec, spec) - assert ingress_spec.backend_service - daemons = self.cache.get_daemons_by_service(ingress_spec.backend_service) - deps = [d.name() for d in daemons] - elif daemon_type == 'keepalived': - # because cephadm creates new daemon instances whenever - # port or ip changes, identifying daemons by name is - # sufficient to detect changes. - if not spec: - return [] - daemons = self.cache.get_daemons_by_service(spec.service_name()) - deps = [d.name() for d in daemons if d.daemon_type == 'haproxy'] - elif daemon_type == 'agent': - root_cert = '' - server_port = '' - try: - server_port = str(self.http_server.agent.server_port) - root_cert = self.cert_mgr.get_root_ca() - except Exception: - pass - deps = sorted([self.get_mgr_ip(), server_port, root_cert, - str(self.device_enhanced_scan)]) - elif daemon_type == 'node-proxy': - root_cert = '' - server_port = '' - try: - server_port = str(self.http_server.agent.server_port) - root_cert = self.cert_mgr.get_root_ca() - except Exception: - pass - deps = sorted([self.get_mgr_ip(), server_port, root_cert]) - elif daemon_type == 'iscsi': - if spec: - iscsi_spec = cast(IscsiServiceSpec, spec) - deps = [self.iscsi_service.get_trusted_ips(iscsi_spec)] - else: - deps = [self.get_mgr_ip()] - elif daemon_type == 'prometheus': - if not mgmt_gw_enabled: - # for prometheus we add the active mgr as an explicit dependency, - # this way we force a redeploy after a mgr failover - deps.append(self.get_active_mgr().name()) - deps.append(str(self.get_module_option_ex('prometheus', 'server_port', 9283))) - deps.append(str(self.service_discovery_port)) - # prometheus yaml configuration file (generated by prometheus.yml.j2) contains - # a scrape_configs section for each service type. This should be included only - # when at least one daemon of the corresponding service is running. Therefore, - # an explicit dependency is added for each service-type to force a reconfig - # whenever the number of daemons for those service-type changes from 0 to greater - # than zero and vice versa. - deps += [s for s in ['node-exporter', 'alertmanager'] - if self.cache.get_daemons_by_service(s)] - if len(self.cache.get_daemons_by_type('ingress')) > 0: - deps.append('ingress') - # add dependency on ceph-exporter daemons - deps += [d.name() for d in self.cache.get_daemons_by_service('ceph-exporter')] - deps += [d.name() for d in self.cache.get_daemons_by_service('mgmt-gateway')] - deps += [d.name() for d in self.cache.get_daemons_by_service('oauth2-proxy')] - if prom_cred_hash is not None: - deps.append(prom_cred_hash) - if alertmgr_cred_hash is not None: - deps.append(alertmgr_cred_hash) - elif daemon_type == 'grafana': - deps += get_daemon_names(['prometheus', 'loki', 'mgmt-gateway', 'oauth2-proxy']) - if prom_cred_hash is not None: - deps.append(prom_cred_hash) - elif daemon_type == 'alertmanager': - deps += get_daemon_names(['alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy']) - if not mgmt_gw_enabled: - deps += get_daemon_names(['mgr']) - if alertmgr_cred_hash is not None: - deps.append(alertmgr_cred_hash) - elif daemon_type == 'promtail': - deps += get_daemon_names(['loki']) - elif daemon_type in ['ceph-exporter', 'node-exporter']: - deps += get_daemon_names(['mgmt-gateway']) - elif daemon_type == JaegerAgentService.TYPE: - for dd in self.cache.get_daemons_by_type(JaegerCollectorService.TYPE): - assert dd.hostname is not None - port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT - deps.append(build_url(host=dd.hostname, port=port).lstrip('/')) - deps = sorted(deps) - elif daemon_type == 'mgmt-gateway': - deps = MgmtGatewayService.get_dependencies(self) - else: - # this daemon type doesn't need deps mgmt - pass - - if daemon_type in ['prometheus', 'node-exporter', 'alertmanager', 'grafana', - 'ceph-exporter']: - deps.append(f'secure_monitoring_stack:{self.secure_monitoring_stack}') - + svc_type = daemon_type_to_service(daemon_type) + svc_cls = self.cephadm_services.get(svc_type, None) + deps = svc_cls.get_dependencies(self, spec, daemon_type) if svc_cls else [] return sorted(deps) @forall_hosts diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 8e9cd00fa81..87b3a1df851 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -1129,8 +1129,8 @@ class CephadmServe: dd.name())) action = 'reconfig' elif last_deps != deps: - self.log.debug(f'{dd.name()} deps {last_deps} -> {deps}') - self.log.info(f'Reconfiguring {dd.name()} (dependencies changed)...') + sym_diff = set(deps).symmetric_difference(last_deps) + self.log.info(f'Reconfiguring {dd.name()} deps {last_deps} -> {deps} (diff {sym_diff})') action = 'reconfig' # we need only redeploy if secure_monitoring_stack or mgmt-gateway value has changed: # TODO(redo): check if we should just go always with redeploy (it's fast enough) diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 4f83d7bb0fb..607e1bd9019 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -265,6 +265,12 @@ class CephadmService(metaclass=ABCMeta): def TYPE(self) -> str: pass + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + return [] + def __init__(self, mgr: "CephadmOrchestrator"): self.mgr: "CephadmOrchestrator" = mgr @@ -576,6 +582,7 @@ class CephadmService(metaclass=ABCMeta): class CephService(CephadmService): + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: # Ceph.daemons (mon, mgr, mds, osd, etc) cephadm_config = self.get_config_and_keyring( @@ -1301,6 +1308,15 @@ class CephExporterService(CephService): TYPE = 'ceph-exporter' DEFAULT_SERVICE_PORT = 9926 + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + + deps = [f'secure_monitoring_stack:{mgr.secure_monitoring_stack}'] + deps += mgr.cache.get_daemons_by_types(['mgmt-gateway']) + return sorted(deps) + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type spec = cast(CephExporterSpec, self.mgr.spec_store[daemon_spec.service_name].spec) @@ -1330,11 +1346,7 @@ class CephExporterService(CephService): daemon_spec.keyring = keyring daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) daemon_spec.final_config = merge_dicts(daemon_spec.final_config, exporter_config) - - deps = [] - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('mgmt-gateway')] - deps += [f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}'] - daemon_spec.deps = deps + daemon_spec.deps = self.get_dependencies(self.mgr) return daemon_spec @@ -1379,6 +1391,20 @@ class CephfsMirrorService(CephService): class CephadmAgent(CephService): TYPE = 'agent' + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + agent = mgr.http_server.agent + return sorted( + [ + str(mgr.get_mgr_ip()), + str(agent.server_port), + mgr.cert_mgr.get_root_ca(), + str(mgr.get_module_option("device_enhanced_scan")), + ] + ) + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type daemon_id, host = daemon_spec.daemon_id, daemon_spec.host diff --git a/src/pybind/mgr/cephadm/services/ingress.py b/src/pybind/mgr/cephadm/services/ingress.py index 7381ef67d7e..442458f2711 100644 --- a/src/pybind/mgr/cephadm/services/ingress.py +++ b/src/pybind/mgr/cephadm/services/ingress.py @@ -2,7 +2,7 @@ import ipaddress import logging import random import string -from typing import List, Dict, Any, Tuple, cast, Optional +from typing import List, Dict, Any, Tuple, cast, Optional, TYPE_CHECKING from ceph.deployment.service_spec import ServiceSpec, IngressSpec from mgr_util import build_url @@ -10,6 +10,9 @@ from cephadm import utils from orchestrator import OrchestratorError, DaemonDescription from cephadm.services.cephadmservice import CephadmDaemonDeploySpec, CephService +if TYPE_CHECKING: + from ..module import CephadmOrchestrator + logger = logging.getLogger(__name__) @@ -17,6 +20,16 @@ class IngressService(CephService): TYPE = 'ingress' MAX_KEEPALIVED_PASS_LEN = 8 + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + if daemon_type == 'haproxy': + return IngressService.get_haproxy_dependencies(mgr, spec) + elif daemon_type == 'keepalived': + return IngressService.get_keepalived_dependencies(mgr, spec) + return [] + def primary_daemon_type(self, spec: Optional[ServiceSpec] = None) -> str: if spec: ispec = cast(IngressSpec, spec) @@ -75,6 +88,18 @@ class IngressService(CephService): return daemon_spec + @staticmethod + def get_haproxy_dependencies(mgr: "CephadmOrchestrator", spec: Optional[ServiceSpec]) -> List[str]: + # because cephadm creates new daemon instances whenever + # port or ip changes, identifying daemons by name is + # sufficient to detect changes. + if not spec: + return [] + ingress_spec = cast(IngressSpec, spec) + assert ingress_spec.backend_service + daemons = mgr.cache.get_daemons_by_service(ingress_spec.backend_service) + return sorted([d.name() for d in daemons]) + def haproxy_generate_config( self, daemon_spec: CephadmDaemonDeploySpec, @@ -86,7 +111,6 @@ class IngressService(CephService): f'{spec.service_name()} backend service {spec.backend_service} does not exist') backend_spec = self.mgr.spec_store[spec.backend_service].spec daemons = self.mgr.cache.get_daemons_by_service(spec.backend_service) - deps = [d.name() for d in daemons] # generate password? pw_key = f'{spec.service_name()}/monitor_password' @@ -201,7 +225,7 @@ class IngressService(CephService): ssl_cert = '\n'.join(ssl_cert) config_files['files']['haproxy.pem'] = ssl_cert - return config_files, sorted(deps) + return config_files, self.get_haproxy_dependencies(self.mgr, spec) def keepalived_prepare_create( self, @@ -220,6 +244,16 @@ class IngressService(CephService): return daemon_spec + @staticmethod + def get_keepalived_dependencies(mgr: "CephadmOrchestrator", spec: Optional[ServiceSpec]) -> List[str]: + # because cephadm creates new daemon instances whenever + # port or ip changes, identifying daemons by name is + # sufficient to detect changes. + if not spec: + return [] + daemons = mgr.cache.get_daemons_by_service(spec.service_name()) + return sorted([d.name() for d in daemons if d.daemon_type == 'haproxy']) + def keepalived_generate_config( self, daemon_spec: CephadmDaemonDeploySpec, @@ -252,8 +286,6 @@ class IngressService(CephService): raise OrchestratorError( f'Failed to generate keepalived.conf: No daemons deployed for {spec.service_name()}') - deps = sorted([d.name() for d in daemons if d.daemon_type == 'haproxy']) - host = daemon_spec.host hosts = sorted(list(set([host] + [str(d.hostname) for d in daemons]))) @@ -394,4 +426,4 @@ class IngressService(CephService): } } - return config_file, deps + return config_file, self.get_keepalived_dependencies(self.mgr, spec) diff --git a/src/pybind/mgr/cephadm/services/iscsi.py b/src/pybind/mgr/cephadm/services/iscsi.py index d2773c19d36..963a845ec74 100644 --- a/src/pybind/mgr/cephadm/services/iscsi.py +++ b/src/pybind/mgr/cephadm/services/iscsi.py @@ -2,19 +2,33 @@ import errno import json import logging import subprocess -from typing import List, cast, Optional +from typing import List, cast, Optional, TYPE_CHECKING from ipaddress import ip_address, IPv6Address from mgr_module import HandleCommandResult -from ceph.deployment.service_spec import IscsiServiceSpec +from ceph.deployment.service_spec import IscsiServiceSpec, ServiceSpec from orchestrator import DaemonDescription, DaemonDescriptionStatus from .cephadmservice import CephadmDaemonDeploySpec, CephService from .. import utils +if TYPE_CHECKING: + from ..module import CephadmOrchestrator + logger = logging.getLogger(__name__) +def get_trusted_ips(mgr: "CephadmOrchestrator", spec: IscsiServiceSpec) -> str: + # add active mgr ip address to trusted list so dashboard can access + trusted_ip_list = spec.trusted_ip_list if spec.trusted_ip_list else '' + mgr_ip = mgr.get_mgr_ip() + if mgr_ip not in [s.strip() for s in trusted_ip_list.split(',')]: + if trusted_ip_list: + trusted_ip_list += ',' + trusted_ip_list += mgr_ip + return trusted_ip_list + + class IscsiService(CephService): TYPE = 'iscsi' @@ -23,15 +37,15 @@ class IscsiService(CephService): assert spec.pool self.mgr._check_pool_exists(spec.pool, spec.service_name()) - def get_trusted_ips(self, spec: IscsiServiceSpec) -> str: - # add active mgr ip address to trusted list so dashboard can access - trusted_ip_list = spec.trusted_ip_list if spec.trusted_ip_list else '' - mgr_ip = self.mgr.get_mgr_ip() - if mgr_ip not in [s.strip() for s in trusted_ip_list.split(',')]: - if trusted_ip_list: - trusted_ip_list += ',' - trusted_ip_list += mgr_ip - return trusted_ip_list + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + if spec: + iscsi_spec = cast(IscsiServiceSpec, spec) + return [get_trusted_ips(mgr, iscsi_spec)] + else: + return [mgr.get_mgr_ip()] def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type @@ -68,8 +82,7 @@ class IscsiService(CephService): 'val': key_data, }) - trusted_ip_list = self.get_trusted_ips(spec) - + trusted_ip_list = get_trusted_ips(self.mgr, spec) context = { 'client_name': '{}.{}'.format(utils.name_to_config_section('iscsi'), igw_id), 'trusted_ip_list': trusted_ip_list, @@ -80,7 +93,7 @@ class IscsiService(CephService): daemon_spec.keyring = keyring daemon_spec.extra_files = {'iscsi-gateway.cfg': igw_conf} daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) - daemon_spec.deps = [trusted_ip_list] + daemon_spec.deps = self.get_dependencies(self.mgr, spec) return daemon_spec def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None: diff --git a/src/pybind/mgr/cephadm/services/jaeger.py b/src/pybind/mgr/cephadm/services/jaeger.py index c83c765d039..6c415512eef 100644 --- a/src/pybind/mgr/cephadm/services/jaeger.py +++ b/src/pybind/mgr/cephadm/services/jaeger.py @@ -1,8 +1,11 @@ -from typing import List, cast +from typing import List, cast, Optional, TYPE_CHECKING from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec -from ceph.deployment.service_spec import TracingSpec +from ceph.deployment.service_spec import TracingSpec, ServiceSpec from mgr_util import build_url +if TYPE_CHECKING: + from ..module import CephadmOrchestrator + class ElasticSearchService(CephadmService): TYPE = 'elasticsearch' @@ -17,19 +20,30 @@ class JaegerAgentService(CephadmService): TYPE = 'jaeger-agent' DEFAULT_SERVICE_PORT = 6799 + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + deps = [] # type: List[str] + for dd in mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE): + # scrape jaeger-collector nodes + assert dd.hostname is not None + port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT + url = build_url(host=dd.hostname, port=port).lstrip('/') + deps.append(url) + return sorted(deps) + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type collectors = [] - deps: List[str] = [] for dd in self.mgr.cache.get_daemons_by_type(JaegerCollectorService.TYPE): # scrape jaeger-collector nodes assert dd.hostname is not None port = dd.ports[0] if dd.ports else JaegerCollectorService.DEFAULT_SERVICE_PORT url = build_url(host=dd.hostname, port=port).lstrip('/') collectors.append(url) - deps.append(url) daemon_spec.final_config = {'collector_nodes': ",".join(collectors)} - daemon_spec.deps = sorted(deps) + daemon_spec.deps = self.get_dependencies(self.mgr) return daemon_spec diff --git a/src/pybind/mgr/cephadm/services/mgmt_gateway.py b/src/pybind/mgr/cephadm/services/mgmt_gateway.py index 0897ce99ff7..0706a9ad177 100644 --- a/src/pybind/mgr/cephadm/services/mgmt_gateway.py +++ b/src/pybind/mgr/cephadm/services/mgmt_gateway.py @@ -1,8 +1,8 @@ import logging -from typing import List, Any, Tuple, Dict, cast, TYPE_CHECKING +from typing import List, Any, Tuple, Dict, cast, Optional, TYPE_CHECKING from orchestrator import DaemonDescription -from ceph.deployment.service_spec import MgmtGatewaySpec, GrafanaSpec +from ceph.deployment.service_spec import MgmtGatewaySpec, GrafanaSpec, ServiceSpec from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec, get_dashboard_endpoints if TYPE_CHECKING: @@ -83,8 +83,10 @@ class MgmtGatewayService(CephadmService): sd_endpoints.append(f"{addr}:{self.mgr.service_discovery_port}") return sd_endpoints - @staticmethod - def get_dependencies(mgr: "CephadmOrchestrator") -> List[str]: + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: # url_prefix for the following services depends on the presence of mgmt-gateway deps = [ f'{d.name()}:{d.ports[0]}' if d.ports else d.name() diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 9c5b5a112f3..bd0620f595f 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -2,7 +2,7 @@ import errno import logging import os import socket -from typing import List, Any, Tuple, Dict, Optional, cast +from typing import List, Any, Tuple, Dict, Optional, cast, TYPE_CHECKING import ipaddress from mgr_module import HandleCommandResult @@ -15,6 +15,9 @@ from mgr_util import verify_tls, ServerConfigException, build_url, get_cert_issu from ceph.deployment.utils import wrap_ipv6 from .. import utils +if TYPE_CHECKING: + from ..module import CephadmOrchestrator + logger = logging.getLogger(__name__) @@ -85,22 +88,26 @@ class GrafanaService(CephadmService): 'mgmt_gw_ips': ','.join(mgmt_gw_ips), }) - def calculate_grafana_deps(self, security_enabled: bool) -> List[str]: + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: deps = [] # type: List[str] - deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') + security_enabled, mgmt_gw_enabled, _ = mgr._get_security_config() + deps.append(f'secure_monitoring_stack:{mgr.secure_monitoring_stack}') # in case security is enabled we have to reconfig when prom user/pass changes - prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() + prometheus_user, prometheus_password = mgr._get_prometheus_credentials() if security_enabled and prometheus_user and prometheus_password: deps.append(f'{utils.md5_hash(prometheus_user + prometheus_password)}') # adding a dependency for mgmt-gateway because the usage of url_prefix relies on its presence. # another dependency is added for oauth-proxy as Grafana login is delegated to this service when enabled. for service in ['prometheus', 'loki', 'mgmt-gateway', 'oauth2-proxy']: - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service(service)] + deps += [d.name() for d in mgr.cache.get_daemons_by_service(service)] - return deps + return sorted(deps) def generate_prom_services(self, security_enabled: bool, mgmt_gw_enabled: bool) -> List[str]: @@ -134,7 +141,6 @@ class GrafanaService(CephadmService): cert, pkey = self.prepare_certificates(daemon_spec) security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config() - deps = self.calculate_grafana_deps(security_enabled) grafana_ini = self.generate_grafana_ini(daemon_spec, mgmt_gw_enabled, oauth2_enabled) grafana_data_sources = self.generate_data_sources(security_enabled, mgmt_gw_enabled, cert, pkey) # the path of the grafana dashboards are assumed from the providers.yml.j2 file by grafana @@ -166,7 +172,7 @@ class GrafanaService(CephadmService): dashboard = f.read() config_file['files'][f'/etc/grafana/provisioning/dashboards/{file_name}'] = dashboard - return config_file, sorted(deps) + return config_file, self.get_dependencies(self.mgr) def prepare_certificates(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]: cert = self.mgr.cert_key_store.get_cert('grafana_cert', host=daemon_spec.host) @@ -297,9 +303,27 @@ class AlertmanagerService(CephadmService): cert, key = self.mgr.cert_mgr.generate_cert([host_fqdn, "alertmanager_servers"], node_ip) return cert, key + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + deps = [] + deps.append(f'secure_monitoring_stack:{mgr.secure_monitoring_stack}') + deps = deps + mgr.cache.get_daemons_by_types(['alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy']) + security_enabled, mgmt_gw_enabled, _ = mgr._get_security_config() + if security_enabled: + alertmanager_user, alertmanager_password = mgr._get_alertmanager_credentials() + if alertmanager_user and alertmanager_password: + alertmgr_cred_hash = f'{utils.md5_hash(alertmanager_user + alertmanager_password)}' + deps.append(alertmgr_cred_hash) + + if not mgmt_gw_enabled: + deps += mgr.cache.get_daemons_by_types(['mgr']) + + return sorted(deps) + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: assert self.TYPE == daemon_spec.daemon_type - deps: List[str] = [] default_webhook_urls: List[str] = [] spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec) @@ -312,31 +336,17 @@ class AlertmanagerService(CephadmService): user_data['default_webhook_urls'], list): default_webhook_urls.extend(user_data['default_webhook_urls']) - # add a dependency since url_prefix depends on the existence of mgmt-gateway - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('mgmt-gateway')] - # add a dependency since enbling basic-auth (or not) depends on the existence of 'oauth2-proxy' - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')] - security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config() if mgmt_gw_enabled: dashboard_urls = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}/dashboard'] else: dashboard_urls = get_dashboard_urls(self) - # scan all mgrs to generate deps and to get standbys too. - for dd in self.mgr.cache.get_daemons_by_service('mgr'): - # we consider mgr a dep even if the dashboard is disabled - # in order to be consistent with _calc_daemon_deps(). - # when mgmt_gw is enabled there's no need for mgr dep as - # mgmt-gw wil route to the active mgr automatically - deps.append(dd.name()) snmp_gateway_urls: List[str] = [] for dd in self.mgr.cache.get_daemons_by_service('snmp-gateway'): assert dd.hostname is not None assert dd.ports addr = dd.ip if dd.ip else self.mgr.get_fqdn(dd.hostname) - deps.append(dd.name()) - snmp_gateway_urls.append(build_url(scheme='http', host=addr, port=dd.ports[0], path='/alerts')) @@ -353,7 +363,6 @@ class AlertmanagerService(CephadmService): port = 9094 for dd in self.mgr.cache.get_daemons_by_service('alertmanager'): assert dd.hostname is not None - deps.append(dd.name()) addr = self.mgr.get_fqdn(dd.hostname) peers.append(build_url(host=addr, port=port).lstrip('/')) @@ -364,11 +373,9 @@ class AlertmanagerService(CephadmService): if ip_to_bind_to: daemon_spec.port_ips = {str(port): ip_to_bind_to} - deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') + deps = self.get_dependencies(self.mgr) if security_enabled: alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() - if alertmanager_user and alertmanager_password: - deps.append(f'{utils.md5_hash(alertmanager_user + alertmanager_password)}') cert, key = self.get_alertmanager_certificates(daemon_spec) context = { 'enable_mtls': mgmt_gw_enabled, @@ -388,7 +395,7 @@ class AlertmanagerService(CephadmService): 'web_config': '/etc/alertmanager/web.yml', 'use_url_prefix': mgmt_gw_enabled, 'ip_to_bind_to': ip_to_bind_to - }, sorted(deps) + }, deps else: return { "files": { @@ -397,7 +404,7 @@ class AlertmanagerService(CephadmService): "peers": peers, 'use_url_prefix': mgmt_gw_enabled, 'ip_to_bind_to': ip_to_bind_to - }, sorted(deps) + }, deps def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: # TODO: if there are multiple daemons, who is the active one? @@ -632,42 +639,46 @@ class PrometheusService(CephadmService): r['files']['/etc/prometheus/alerting/custom_alerts.yml'] = \ self.mgr.get_store('services/prometheus/alerting/custom_alerts.yml', '') - return r, sorted(self.calculate_deps()) + return r, self.get_dependencies(self.mgr) - def calculate_deps(self) -> List[str]: + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: deps = [] # type: List[str] - port = cast(int, self.mgr.get_module_option_ex('prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT)) + port = cast(int, mgr.get_module_option_ex('prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)) deps.append(str(port)) - deps.append(str(self.mgr.service_discovery_port)) - deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') - security_enabled, mgmt_gw_enabled, _ = self.mgr._get_security_config() + deps.append(str(mgr.service_discovery_port)) + deps.append(f'secure_monitoring_stack:{mgr.secure_monitoring_stack}') + security_enabled, mgmt_gw_enabled, _ = mgr._get_security_config() if not mgmt_gw_enabled: # add an explicit dependency on the active manager. This will force to # re-deploy prometheus if the mgr has changed (due to a fail-over i.e). # when mgmt_gw is enabled there's no need for such dep as mgmt-gw wil # route to the active mgr automatically - deps.append(self.mgr.get_active_mgr().name()) + deps.append(mgr.get_active_mgr().name()) if security_enabled: - alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() - prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() + alertmanager_user, alertmanager_password = mgr._get_alertmanager_credentials() + prometheus_user, prometheus_password = mgr._get_prometheus_credentials() if prometheus_user and prometheus_password: deps.append(f'{utils.md5_hash(prometheus_user + prometheus_password)}') if alertmanager_user and alertmanager_password: deps.append(f'{utils.md5_hash(alertmanager_user + alertmanager_password)}') # add a dependency since url_prefix depends on the existence of mgmt-gateway - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('mgmt-gateway')] + deps += [d.name() for d in mgr.cache.get_daemons_by_service('mgmt-gateway')] # add a dependency since enbling basic-auth (or not) depends on the existence of 'oauth2-proxy' - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')] + deps += [d.name() for d in mgr.cache.get_daemons_by_service('oauth2-proxy')] # add dependency on ceph-exporter daemons - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('ceph-exporter')] - deps += [s for s in ['node-exporter', 'alertmanager'] if self.mgr.cache.get_daemons_by_service(s)] - if len(self.mgr.cache.get_daemons_by_type('ingress')) > 0: + deps += [d.name() for d in mgr.cache.get_daemons_by_service('ceph-exporter')] + deps += [s for s in ['node-exporter', 'alertmanager'] if mgr.cache.get_daemons_by_service(s)] + if len(mgr.cache.get_daemons_by_type('ingress')) > 0: deps.append('ingress') - return deps + + return sorted(deps) def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: # TODO: if there are multiple daemons, who is the active one? @@ -725,6 +736,15 @@ class NodeExporterService(CephadmService): TYPE = 'node-exporter' DEFAULT_SERVICE_PORT = 9100 + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + deps = [] + deps.append(f'secure_monitoring_stack:{mgr.secure_monitoring_stack}') + deps = deps + mgr.cache.get_daemons_by_types(['mgmt-gateway']) + return sorted(deps) + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) @@ -794,6 +814,12 @@ class PromtailService(CephadmService): TYPE = 'promtail' DEFAULT_SERVICE_PORT = 9080 + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + return sorted(mgr.cache.get_daemons_by_types(['loki'])) + def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec: assert self.TYPE == daemon_spec.daemon_type daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) @@ -801,8 +827,7 @@ class PromtailService(CephadmService): def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: assert self.TYPE == daemon_spec.daemon_type - deps: List[str] = [] - + deps: List[str] = self.get_dependencies(self.mgr) daemons = self.mgr.cache.get_daemons_by_service('loki') loki_host = '' for i, dd in enumerate(daemons): @@ -810,8 +835,6 @@ class PromtailService(CephadmService): if i == 0: loki_host = dd.ip if dd.ip else self.mgr.get_fqdn(dd.hostname) - deps.append(dd.name()) - context = { 'client_hostname': loki_host, } @@ -821,7 +844,7 @@ class PromtailService(CephadmService): "files": { "promtail.yml": yml } - }, sorted(deps) + }, deps class SNMPGatewayService(CephadmService): diff --git a/src/pybind/mgr/cephadm/services/node_proxy.py b/src/pybind/mgr/cephadm/services/node_proxy.py index 00849da20e3..8ad230b6342 100644 --- a/src/pybind/mgr/cephadm/services/node_proxy.py +++ b/src/pybind/mgr/cephadm/services/node_proxy.py @@ -3,13 +3,16 @@ import ssl import base64 from urllib.error import HTTPError, URLError -from typing import List, Any, Dict, Tuple, Optional, MutableMapping +from typing import List, Any, Dict, Tuple, Optional, MutableMapping, TYPE_CHECKING from .cephadmservice import CephadmDaemonDeploySpec, CephService from ceph.deployment.service_spec import ServiceSpec, PlacementSpec from ceph.utils import http_req from orchestrator import OrchestratorError +if TYPE_CHECKING: + from ..module import CephadmOrchestrator + class NodeProxy(CephService): TYPE = 'node-proxy' @@ -29,6 +32,19 @@ class NodeProxy(CephService): return daemon_spec + @classmethod + def get_dependencies(cls, mgr: "CephadmOrchestrator", + spec: Optional[ServiceSpec] = None, + daemon_type: Optional[str] = None) -> List[str]: + root_cert = '' + server_port = '' + try: + server_port = str(mgr.http_server.agent.server_port) + root_cert = mgr.cert_mgr.get_root_ca() + except Exception: + pass + return sorted([mgr.get_mgr_ip(), server_port, root_cert]) + def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: # node-proxy is re-using the agent endpoint and therefore # needs similar checks to see if the endpoint is ready. @@ -52,8 +68,7 @@ class NodeProxy(CephService): } config = {'node-proxy.json': json.dumps(cfg)} - return config, sorted([str(self.mgr.get_mgr_ip()), str(self.agent_endpoint.server_port), - self.mgr.cert_mgr.get_root_ca()]) + return config, self.get_dependencies(self.mgr) def handle_hw_monitoring_setting(self) -> bool: # function to apply or remove node-proxy service spec depending diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index d872219df80..edb0f887188 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -284,7 +284,7 @@ class TestISCSIService: @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_unique_name") - @patch("cephadm.services.iscsi.IscsiService.get_trusted_ips") + @patch("cephadm.services.iscsi.get_trusted_ips") def test_iscsi_config(self, _get_trusted_ips, _get_name, _run_cephadm, cephadm_module: CephadmOrchestrator): iscsi_daemon_id = 'testpool.test.qwert' -- 2.39.5