From: Adam King Date: Wed, 30 Jul 2025 19:51:11 +0000 (-0400) Subject: mgr/cephadm: don't use list_servers to get active mgr host for prometheus SD config X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=726bb5a95de7857c220953a1ed26ed3263213c6f;p=ceph.git mgr/cephadm: don't use list_servers to get active mgr host for prometheus SD config Having a lot of calls into list_servers causes issues with the core ceph mgr on large clusters. Additionally, we were using it purely to get the active mgr's host here, which cephadm should be able to do without needing a mgr api call Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index ab92858df7cfc..d823f67bc8a81 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -9,7 +9,6 @@ except ImportError: import logging import orchestrator # noqa -from mgr_module import ServiceInfoT from mgr_util import build_url from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO from cephadm.services.nfs import NFSService @@ -184,17 +183,16 @@ class Root(Server): return [] def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]: - """Return compatible prometheus config for prometheus service.""" - servers = self.mgr.list_servers() + """Return compatible prometheus config for prometheus service. + Targets should be a length one list containing only the active mgr + """ targets = [] - for server in servers: - hostname = server.get('hostname', '') - for service in cast(List[ServiceInfoT], server.get('services', [])): - if service['type'] != 'mgr' or service['id'] != self.mgr.get_mgr_id(): - continue - port = self.mgr.get_module_option_ex( - 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) - targets.append(f'{hostname}:{port}') + mgr_daemons = self.mgr.cache.get_daemons_by_service('mgr') + host = service_registry.get_service('mgr').get_active_daemon(mgr_daemons).hostname or '' + fqdn = self.mgr.get_fqdn(host) + port = self.mgr.get_module_option_ex( + 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) + targets.append(f'{fqdn}:{port}') return [{"targets": targets, "labels": {}}] def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]: diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index 9dac31015135f..fd0eaff35b9a1 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -4,12 +4,13 @@ from cephadm.services.service_registry import service_registry class FakeDaemonDescription: - def __init__(self, ip, ports, hostname, service_name='', daemon_type=''): + def __init__(self, ip, ports, hostname, service_name='', daemon_type='', daemon_id=''): self.ip = ip self.ports = ports self.hostname = hostname self._service_name = service_name self.daemon_type = daemon_type + self.daemon_id = daemon_id if daemon_id else hostname def service_name(self): return self._service_name @@ -36,6 +37,10 @@ class FakeCache: return [FakeDaemonDescription('1.2.3.4', [9123], 'node0'), FakeDaemonDescription('1.2.3.5', [9123], 'node1')] + if service_type == 'mgr': + return [FakeDaemonDescription('1.2.3.4', [9922], 'node0', daemon_type='mgr', daemon_id='fake_active_mgr'), + FakeDaemonDescription('1.2.3.5', [9922], 'node1', daemon_type='mgr', daemon_id='fake_standby_mgr')] + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), FakeDaemonDescription('1.2.3.5', [9200], 'node1')] @@ -111,19 +116,6 @@ class FakeMgr: def get_mgr_id(self): return 'mgr-1' - def list_servers(self): - - servers = [ - {'hostname': 'node0', - 'ceph_version': '16.2', - 'services': [{'type': 'mgr', 'id': 'mgr-1'}, {'type': 'mon'}]}, - {'hostname': 'node1', - 'ceph_version': '16.2', - 'services': [{'type': 'mgr', 'id': 'mgr-2'}, {'type': 'mon'}]} - ] - - return servers - def _check_mon_command(self, cmd_dict, inbuf=None): prefix = cmd_dict.get('prefix') if prefix == 'get-cmd': @@ -136,6 +128,14 @@ class FakeMgr: def get_module_option_ex(self, module, option, default_value): return "9283" + def daemon_is_self(self, d_type, d_id) -> bool: + if d_type == 'mgr' and d_id == 'fake_active_mgr': + return True + return False + + def get_fqdn(self, hostname: str) -> str: + return hostname + class TestServiceDiscovery: