git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: don't use list_servers to get active mgr host for prometheus SD config 64763/head
author: Adam King <adking@redhat.com>
Wed, 30 Jul 2025 19:51:11 +0000 (15:51 -0400)
committer: Adam King <adking@redhat.com>
Wed, 30 Jul 2025 19:51:11 +0000 (15:51 -0400)
Having a lot of calls into list_servers causes issues with
the core ceph mgr on large clusters. Additionally, we were
using it purely to get the active mgr's host here, which
cephadm should be able to do without needing a mgr API call.

Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/service_discovery.py
src/pybind/mgr/cephadm/tests/test_service_discovery.py

index ab92858df7cfc4ef24a728fe2474783265e5564f..d823f67bc8a811e534fd1c1a3a4cf68f02489b68 100644 (file)
@@ -9,7 +9,6 @@ except ImportError:
 import logging
 
 import orchestrator  # noqa
-from mgr_module import ServiceInfoT
 from mgr_util import build_url
 from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO
 from cephadm.services.nfs import NFSService
@@ -184,17 +183,16 @@ class Root(Server):
             return []
 
     def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]:
-        """Return <http_sd_config> compatible prometheus config for prometheus service."""
-        servers = self.mgr.list_servers()
+        """Return <http_sd_config> compatible prometheus config for prometheus service.
+        Targets should be a length one list containing only the active mgr
+        """
         targets = []
-        for server in servers:
-            hostname = server.get('hostname', '')
-            for service in cast(List[ServiceInfoT], server.get('services', [])):
-                if service['type'] != 'mgr' or service['id'] != self.mgr.get_mgr_id():
-                    continue
-                port = self.mgr.get_module_option_ex(
-                    'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)
-                targets.append(f'{hostname}:{port}')
+        mgr_daemons = self.mgr.cache.get_daemons_by_service('mgr')
+        host = service_registry.get_service('mgr').get_active_daemon(mgr_daemons).hostname or ''
+        fqdn = self.mgr.get_fqdn(host)
+        port = self.mgr.get_module_option_ex(
+            'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)
+        targets.append(f'{fqdn}:{port}')
         return [{"targets": targets, "labels": {}}]
 
     def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]:
index 9dac31015135f6afc3c76582b119392f64fdd736..fd0eaff35b9a135d981d82c067a07d83cdb0d99d 100644 (file)
@@ -4,12 +4,13 @@ from cephadm.services.service_registry import service_registry
 
 
 class FakeDaemonDescription:
-    def __init__(self, ip, ports, hostname, service_name='', daemon_type=''):
+    def __init__(self, ip, ports, hostname, service_name='', daemon_type='', daemon_id=''):
         self.ip = ip
         self.ports = ports
         self.hostname = hostname
         self._service_name = service_name
         self.daemon_type = daemon_type
+        self.daemon_id = daemon_id if daemon_id else hostname
 
     def service_name(self):
         return self._service_name
@@ -36,6 +37,10 @@ class FakeCache:
             return [FakeDaemonDescription('1.2.3.4', [9123], 'node0'),
                     FakeDaemonDescription('1.2.3.5', [9123], 'node1')]
 
+        if service_type == 'mgr':
+            return [FakeDaemonDescription('1.2.3.4', [9922], 'node0', daemon_type='mgr', daemon_id='fake_active_mgr'),
+                    FakeDaemonDescription('1.2.3.5', [9922], 'node1', daemon_type='mgr', daemon_id='fake_standby_mgr')]
+
         return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'),
                 FakeDaemonDescription('1.2.3.5', [9200], 'node1')]
 
@@ -111,19 +116,6 @@ class FakeMgr:
     def get_mgr_id(self):
         return 'mgr-1'
 
-    def list_servers(self):
-
-        servers = [
-            {'hostname': 'node0',
-             'ceph_version': '16.2',
-             'services': [{'type': 'mgr', 'id': 'mgr-1'}, {'type': 'mon'}]},
-            {'hostname': 'node1',
-             'ceph_version': '16.2',
-             'services': [{'type': 'mgr', 'id': 'mgr-2'}, {'type': 'mon'}]}
-        ]
-
-        return servers
-
     def _check_mon_command(self, cmd_dict, inbuf=None):
         prefix = cmd_dict.get('prefix')
         if prefix == 'get-cmd':
@@ -136,6 +128,14 @@ class FakeMgr:
     def get_module_option_ex(self, module, option, default_value):
         return "9283"
 
+    def daemon_is_self(self, d_type, d_id) -> bool:
+        if d_type == 'mgr' and d_id == 'fake_active_mgr':
+            return True
+        return False
+
+    def get_fqdn(self, hostname: str) -> str:
+        return hostname
+
 
 class TestServiceDiscovery: