git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: don't use list_servers to get active mgr host for prometheus SD config
author Adam King <adking@redhat.com>
Wed, 30 Jul 2025 19:51:11 +0000 (15:51 -0400)
committer Adam King <adking@redhat.com>
Tue, 16 Sep 2025 21:07:15 +0000 (17:07 -0400)
Having a lot of calls into list_servers causes issues with
the core ceph mgr on large clusters. Additionally, we were
using it purely to get the active mgr's host here, which
cephadm should be able to do without needing a mgr api call.

Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 726bb5a95de7857c220953a1ed26ed3263213c6f)

src/pybind/mgr/cephadm/service_discovery.py
src/pybind/mgr/cephadm/tests/test_service_discovery.py

index cddddcd3b088d2630e7d2171410acbf2e9684975..6f5889ec080794e625f1ca9fef869ff3063e518f 100644 (file)
@@ -9,7 +9,6 @@ except ImportError:
 import logging
 
 import orchestrator  # noqa
-from mgr_module import ServiceInfoT
 from mgr_util import build_url
 from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO
 from cephadm.services.nfs import NFSService
@@ -21,6 +20,7 @@ import tempfile
 from cephadm.services.ingress import IngressSpec
 from cephadm.services.cephadmservice import CephExporterService
 from cephadm.services.nvmeof import NvmeofService
+from cephadm.services.service_registry import service_registry
 
 from ceph.deployment.service_spec import SMBSpec
 
@@ -183,17 +183,16 @@ class Root(Server):
             return []
 
     def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]:
-        """Return <http_sd_config> compatible prometheus config for prometheus service."""
-        servers = self.mgr.list_servers()
+        """Return <http_sd_config> compatible prometheus config for prometheus service.
+        Targets should be a length one list containing only the active mgr
+        """
         targets = []
-        for server in servers:
-            hostname = server.get('hostname', '')
-            for service in cast(List[ServiceInfoT], server.get('services', [])):
-                if service['type'] != 'mgr' or service['id'] != self.mgr.get_mgr_id():
-                    continue
-                port = self.mgr.get_module_option_ex(
-                    'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)
-                targets.append(f'{hostname}:{port}')
+        mgr_daemons = self.mgr.cache.get_daemons_by_service('mgr')
+        host = service_registry.get_service('mgr').get_active_daemon(mgr_daemons).hostname or ''
+        fqdn = self.mgr.get_fqdn(host)
+        port = self.mgr.get_module_option_ex(
+            'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT)
+        targets.append(f'{fqdn}:{port}')
         return [{"targets": targets, "labels": {}}]
 
     def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]:
index 48ce131b839a5cd8aa7756f1c20769e2e7590ed9..a2f14d0e1c12fa4d2b237226e4fc9836ad41621f 100644 (file)
@@ -1,14 +1,16 @@
 from unittest.mock import MagicMock
 from cephadm.service_discovery import Root
+from cephadm.services.service_registry import service_registry
 
 
 class FakeDaemonDescription:
-    def __init__(self, ip, ports, hostname, service_name='', daemon_type=''):
+    def __init__(self, ip, ports, hostname, service_name='', daemon_type='', daemon_id=''):
         self.ip = ip
         self.ports = ports
         self.hostname = hostname
         self._service_name = service_name
         self.daemon_type = daemon_type
+        self.daemon_id = daemon_id if daemon_id else hostname
 
     def service_name(self):
         return self._service_name
@@ -35,12 +37,20 @@ class FakeCache:
             return [FakeDaemonDescription('1.2.3.4', [9123], 'node0'),
                     FakeDaemonDescription('1.2.3.5', [9123], 'node1')]
 
+        if service_type == 'mgr':
+            return [FakeDaemonDescription('1.2.3.4', [9922], 'node0', daemon_type='mgr', daemon_id='fake_active_mgr'),
+                    FakeDaemonDescription('1.2.3.5', [9922], 'node1', daemon_type='mgr', daemon_id='fake_standby_mgr')]
+
         return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'),
                 FakeDaemonDescription('1.2.3.5', [9200], 'node1')]
 
     def get_daemons_by_type(self, daemon_type):
-        return [FakeDaemonDescription('1.2.3.4', [9100], 'node0', 'ingress', 'haproxy'),
-                FakeDaemonDescription('1.2.3.5', [9200], 'node1', 'ingress', 'haproxy')]
+        if daemon_type == 'ingress':
+            return [FakeDaemonDescription('1.2.3.4', [9100], 'node0', 'ingress', 'haproxy'),
+                    FakeDaemonDescription('1.2.3.5', [9200], 'node1', 'ingress', 'haproxy')]
+        else:
+            return [FakeDaemonDescription('1.2.3.4', [1234], 'node0', daemon_type, daemon_type),
+                    FakeDaemonDescription('1.2.3.5', [1234], 'node1', daemon_type, daemon_type)]
 
 
 class FakeInventory:
@@ -84,23 +94,11 @@ class FakeMgr:
         self.inventory = FakeInventory()
         self.cache = FakeCache()
         self.spec_store = FakeSpecStore(self)
+        service_registry.init_services(self)
 
     def get_mgr_id(self):
         return 'mgr-1'
 
-    def list_servers(self):
-
-        servers = [
-            {'hostname': 'node0',
-             'ceph_version': '16.2',
-             'services': [{'type': 'mgr', 'id': 'mgr-1'}, {'type': 'mon'}]},
-            {'hostname': 'node1',
-             'ceph_version': '16.2',
-             'services': [{'type': 'mgr', 'id': 'mgr-2'}, {'type': 'mon'}]}
-        ]
-
-        return servers
-
     def _check_mon_command(self, cmd_dict, inbuf=None):
         prefix = cmd_dict.get('prefix')
         if prefix == 'get-cmd':
@@ -113,6 +111,14 @@ class FakeMgr:
     def get_module_option_ex(self, module, option, default_value):
         return "9283"
 
+    def daemon_is_self(self, d_type, d_id) -> bool:
+        if d_type == 'mgr' and d_id == 'fake_active_mgr':
+            return True
+        return False
+
+    def get_fqdn(self, hostname: str) -> str:
+        return hostname
+
 
 class TestServiceDiscovery: