From: Adam King Date: Wed, 30 Jul 2025 19:51:11 +0000 (-0400) Subject: mgr/cephadm: don't use list_servers to get active mgr host for prometheus SD config X-Git-Tag: v20.1.1~17^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F64895%2Fhead;p=ceph.git mgr/cephadm: don't use list_servers to get active mgr host for prometheus SD config Having a lot of calls into list_servers causes issues with the core ceph mgr on large clusters. Additionally, we were using it purely to get the active mgr's host here, which cephadm should be able to do without needing a mgr api call Signed-off-by: Adam King (cherry picked from commit 726bb5a95de7857c220953a1ed26ed3263213c6f) --- diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index cddddcd3b088..6f5889ec0807 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -9,7 +9,6 @@ except ImportError: import logging import orchestrator # noqa -from mgr_module import ServiceInfoT from mgr_util import build_url from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO from cephadm.services.nfs import NFSService @@ -21,6 +20,7 @@ import tempfile from cephadm.services.ingress import IngressSpec from cephadm.services.cephadmservice import CephExporterService from cephadm.services.nvmeof import NvmeofService +from cephadm.services.service_registry import service_registry from ceph.deployment.service_spec import SMBSpec @@ -183,17 +183,16 @@ class Root(Server): return [] def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]: - """Return compatible prometheus config for prometheus service.""" - servers = self.mgr.list_servers() + """Return compatible prometheus config for prometheus service. + Targets should be a length one list containing only the active mgr + """ targets = [] - for server in servers: - hostname = server.get('hostname', '') - for service in cast(List[ServiceInfoT], server.get('services', [])): - if service['type'] != 'mgr' or service['id'] != self.mgr.get_mgr_id(): - continue - port = self.mgr.get_module_option_ex( - 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) - targets.append(f'{hostname}:{port}') + mgr_daemons = self.mgr.cache.get_daemons_by_service('mgr') + host = service_registry.get_service('mgr').get_active_daemon(mgr_daemons).hostname or '' + fqdn = self.mgr.get_fqdn(host) + port = self.mgr.get_module_option_ex( + 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) + targets.append(f'{fqdn}:{port}') return [{"targets": targets, "labels": {}}] def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]: diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index 48ce131b839a..a2f14d0e1c12 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -1,14 +1,16 @@ from unittest.mock import MagicMock from cephadm.service_discovery import Root +from cephadm.services.service_registry import service_registry class FakeDaemonDescription: - def __init__(self, ip, ports, hostname, service_name='', daemon_type=''): + def __init__(self, ip, ports, hostname, service_name='', daemon_type='', daemon_id=''): self.ip = ip self.ports = ports self.hostname = hostname self._service_name = service_name self.daemon_type = daemon_type + self.daemon_id = daemon_id if daemon_id else hostname def service_name(self): return self._service_name @@ -35,12 +37,20 @@ class FakeCache: return [FakeDaemonDescription('1.2.3.4', [9123], 'node0'), FakeDaemonDescription('1.2.3.5', [9123], 'node1')] + if service_type == 'mgr': + return [FakeDaemonDescription('1.2.3.4', [9922], 'node0', daemon_type='mgr', daemon_id='fake_active_mgr'), + FakeDaemonDescription('1.2.3.5', [9922], 'node1', daemon_type='mgr', daemon_id='fake_standby_mgr')] + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), FakeDaemonDescription('1.2.3.5', [9200], 'node1')] def get_daemons_by_type(self, daemon_type): - return [FakeDaemonDescription('1.2.3.4', [9100], 'node0', 'ingress', 'haproxy'), - FakeDaemonDescription('1.2.3.5', [9200], 'node1', 'ingress', 'haproxy')] + if daemon_type == 'ingress': + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0', 'ingress', 'haproxy'), + FakeDaemonDescription('1.2.3.5', [9200], 'node1', 'ingress', 'haproxy')] + else: + return [FakeDaemonDescription('1.2.3.4', [1234], 'node0', daemon_type, daemon_type), + FakeDaemonDescription('1.2.3.5', [1234], 'node1', daemon_type, daemon_type)] class FakeInventory: @@ -84,23 +94,11 @@ class FakeMgr: self.inventory = FakeInventory() self.cache = FakeCache() self.spec_store = FakeSpecStore(self) + service_registry.init_services(self) def get_mgr_id(self): return 'mgr-1' - def list_servers(self): - - servers = [ - {'hostname': 'node0', - 'ceph_version': '16.2', - 'services': [{'type': 'mgr', 'id': 'mgr-1'}, {'type': 'mon'}]}, - {'hostname': 'node1', - 'ceph_version': '16.2', - 'services': [{'type': 'mgr', 'id': 'mgr-2'}, {'type': 'mon'}]} - ] - - return servers - def _check_mon_command(self, cmd_dict, inbuf=None): prefix = cmd_dict.get('prefix') if prefix == 'get-cmd': @@ -113,6 +111,14 @@ class FakeMgr: def get_module_option_ex(self, module, option, default_value): return "9283" + def daemon_is_self(self, d_type, d_id) -> bool: + if d_type == 'mgr' and d_id == 'fake_active_mgr': + return True + return False + + def get_fqdn(self, hostname: str) -> str: + return hostname + class TestServiceDiscovery: