From 737f30f12c4764a2e5c76ef6d8b40068b07543a9 Mon Sep 17 00:00:00 2001 From: kginon Date: Tue, 3 Mar 2026 18:39:34 +0200 Subject: [PATCH] mgr/cephadm: return IP addresses in prometheus_sd_config Align prometheus service-discovery to return IP addresses for the manager target, following the same pattern used for alertmanager and node-exporter. This avoids discovery failures in environments where the Prometheus host cannot resolve the FQDNs of the Ceph cluster nodes. Fixes: https://tracker.ceph.com/issues/74658 Signed-off-by: Kobi Ginon --- .../mgr/cephadm/services/service_discovery.py | 13 +++++---- .../cephadm/tests/test_service_discovery.py | 29 +++++++++++++++++-- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/pybind/mgr/cephadm/services/service_discovery.py b/src/pybind/mgr/cephadm/services/service_discovery.py index e874f90fe2ac..1fef54ceb7fd 100644 --- a/src/pybind/mgr/cephadm/services/service_discovery.py +++ b/src/pybind/mgr/cephadm/services/service_discovery.py @@ -169,11 +169,14 @@ class Root: """ targets = [] mgr_daemons = self.mgr.cache.get_daemons_by_service('mgr') - host = service_registry.get_service('mgr').get_active_daemon(mgr_daemons).hostname or '' - fqdn = self.mgr.get_fqdn(host) - port = self.mgr.get_module_option_ex( - 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) - targets.append(f'{fqdn}:{port}') + dd = service_registry.get_service('mgr').get_active_daemon(mgr_daemons) + if dd: + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port_val = self.mgr.get_module_option_ex( + 'prometheus', 'server_port', PrometheusService.DEFAULT_MGR_PROMETHEUS_PORT) + port: Optional[int] = int(port_val) if port_val is not None else None + targets.append(build_url(host=addr, port=port).lstrip('/')) return [{"targets": targets, "labels": {}}] def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]: diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index ab86a6829dc5..3db1a06d8f26 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -1,4 +1,5 @@ from unittest.mock import MagicMock +from typing import List from cephadm.services.service_discovery import Root from cephadm.services.service_registry import service_registry @@ -128,7 +129,10 @@ class FakeMgr: return -1, '', 'error' def get_module_option_ex(self, module, option, default_value): - return "9283" + # Port value for prometheus server_port (used by prometheus_sd_config). + if module == 'prometheus' and option == 'server_port': + return 9283 + return default_value def daemon_is_self(self, d_type, d_id) -> bool: if d_type == 'mgr' and d_id == 'fake_active_mgr': @@ -152,8 +156,27 @@ class TestServiceDiscovery: assert 'labels' in entry assert 'targets' in entry - # check content - assert cfg[0]['targets'] == ['node0:9283'] + # check content - should return IP address instead of hostname + assert cfg[0]['targets'] == ['1.2.3.4:9283'] + + def test_get_sd_config_prometheus_uses_ip_when_hostname_unresolvable(self, monkeypatch): + mgr = FakeMgr() + + def _mgr_daemons_no_ip(service_type: str) -> List[FakeDaemonDescription]: + if service_type == 'mgr': + return [ + FakeDaemonDescription(None, [9922], 'node0', daemon_type='mgr', daemon_id='fake_active_mgr'), + FakeDaemonDescription('1.2.3.5', [9922], 'node1', daemon_type='mgr', daemon_id='fake_standby_mgr'), + ] + return FakeCache().get_daemons_by_service(service_type) + + monkeypatch.setattr(mgr.cache, 'get_daemons_by_service', _mgr_daemons_no_ip) + + root = Root(mgr) + cfg = root.get_sd_config('mgr-prometheus') + + # Expect fallback to inventory IP (FakeInventory returns 1.2.3.4) + assert cfg == [{"targets": ["1.2.3.4:9283"], "labels": {}}] def test_get_sd_config_node_exporter(self): mgr = FakeMgr() -- 2.47.3