From b184df73d0e362eb1afde0b7df6d5ff7fb27a4c1 Mon Sep 17 00:00:00 2001 From: Redouane Kachach Date: Wed, 20 Aug 2025 14:01:28 +0200 Subject: [PATCH] mgr/cepahdm: fixing nvmeof scraping Prometheus config generation Signed-off-by: Redouane Kachach --- src/pybind/mgr/cephadm/services/monitoring.py | 32 ++++++++------- src/pybind/mgr/cephadm/services/nvmeof.py | 9 +++- .../mgr/cephadm/services/service_registry.py | 1 - src/pybind/mgr/cephadm/tests/test_services.py | 41 +++++++++++++++++++ 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index ad6d30fdc63..997d522ed9c 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -522,10 +522,16 @@ class PrometheusService(CephadmService): """ Retrieves the service discovery URLs for the services that require monitoring + Note: we always add the 'ceph' Prometheus target as it corresponds to the prometheus-mgr module target + Returns: Dict[str, List[str]]: A dictionary where the keys represent service categories (e.g., "nfs", "node-exporterr") and the values are a list of service-discovery URLs used to get the corresponding service targets. """ + + def sd_urls(svc: str, prefixes: List[str]) -> list[str]: + return [f'{p}/sd/prometheus/sd-config?service={svc}' for p in prefixes] + if mgmt_gw_enabled: service_discovery_url_prefixes = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}'] else: @@ -533,13 +539,13 @@ class PrometheusService(CephadmService): protocol = 'https' if security_enabled else 'http' service_discovery_url_prefixes = [f'{protocol}://{wrap_ipv6(ip)}:{port}' for ip in self.mgr._get_mgr_ips()] - return { - service: [f'{prefix}/sd/prometheus/sd-config?service={service}' for prefix in service_discovery_url_prefixes] - for service in service_registry.get_services_requiring_monitoring() - if service == 'ceph' - or bool(self.mgr.cache.get_daemons_by_service(service)) - or bool(self.mgr.cache.get_daemons_by_type(service)) - } + + services_to_monitor = ['ceph', *( + s for s in service_registry.get_services_requiring_monitoring() + if self.mgr.cache.get_daemons_by_service(s) or self.mgr.cache.get_daemons_by_type(s) + )] + + return {s: sd_urls(s, service_discovery_url_prefixes) for s in services_to_monitor} def configure_alerts(self, r: Dict) -> None: # include alerts, if present in the container @@ -666,14 +672,12 @@ class PrometheusService(CephadmService): deps.append(f'alert-cred:{utils.md5_hash(alertmanager_user + alertmanager_password)}') # Adding other services as deps (with corresponding justification): - # ceph-exporter: scraping target - # node-exporter: scraping target - # ingress : scraping target - # alert-manager: part of prometheus configuration - # mgmt-gateway : since url_prefix depends on the existence of mgmt-gateway + # mgmt-gateway : url_prefix depends on the existence of mgmt-gateway # oauth2-proxy : enbling basic-auth (or not) depends on the existence of 'oauth2-proxy' - for svc in ['mgmt-gateway', 'oauth2-proxy', 'alertmanager', 'node-exporter', 'ceph-exporter', 'ingress']: - deps.append(f'{svc}_configured:{bool(mgr.cache.get_daemons_by_service(svc))}') + prometheus_svc_deps = service_registry.get_services_requiring_monitoring() + ['mgmt-gateway', 'oauth2-proxy'] + for svc in prometheus_svc_deps: + configured = bool(mgr.cache.get_daemons_by_service(svc)) or bool(mgr.cache.get_daemons_by_type(svc)) + deps.append(f'{svc}_configured:{configured}') if not mgmt_gw_enabled: # Ceph mgrs are dependency because when mgmt-gateway is not enabled the service-discovery depends on mgrs ips diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index f08ba9462a0..405bc349977 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -158,9 +158,14 @@ class NvmeofService(CephService): def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None: def get_set_cmd_dicts(out: str) -> List[dict]: - gateways = json.loads(out)['gateways'] - cmd_dicts = [] + try: + gateways = json.loads(out).get('gateways', []) + except json.decoder.JSONDecodeError as e: + logger.error(f'Error while trying to parse gateways JSON: {e}') + return [] + + cmd_dicts = [] for dd in daemon_descrs: spec = cast(NvmeofServiceSpec, self.mgr.spec_store.all_specs.get(dd.service_name(), None)) diff --git a/src/pybind/mgr/cephadm/services/service_registry.py b/src/pybind/mgr/cephadm/services/service_registry.py index 10fb893a3e2..20879019fc9 100644 --- a/src/pybind/mgr/cephadm/services/service_registry.py +++ b/src/pybind/mgr/cephadm/services/service_registry.py @@ -69,7 +69,6 @@ class CephadmServiceRegistry: def get_services_requiring_monitoring(self) -> List[str]: """Return a list with service types that requiere monitoring.""" services_to_monitor = [svc for svc in self._services if self._services[svc].needs_monitoring] - services_to_monitor.append('ceph') # this is needed for mgr-prometheus targets return sorted(services_to_monitor) diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index ffa6537dcf4..72a10edddb3 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -1145,6 +1145,8 @@ class TestMonitoring: @patch("cephadm.module.CephadmOrchestrator._get_mgr_ips", lambda _: ['192.168.100.100', '::1']) def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + pool = 'testpool' + group = 'mygroup' s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast') with with_host(cephadm_module, 'test'): # host "test" needs to have networks for keepalive to be placed @@ -1165,6 +1167,9 @@ class TestMonitoring: keepalived_password='12345', virtual_ip="1.2.3.4/32", backend_service='rgw.foo')) as _, \ + with_service(cephadm_module, NvmeofServiceSpec(service_id=f'{pool}.{group}', + group=group, + pool=pool)) as _, \ with_service(cephadm_module, PrometheusSpec('prometheus', networks=['1.2.3.0/24'], only_bind_port_on_networks=True)) as _: @@ -1231,6 +1236,16 @@ class TestMonitoring: - url: http://192.168.100.100:8765/sd/prometheus/sd-config?service=node-exporter - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter + - job_name: 'nvmeof' + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: fsid + honor_labels: true + http_sd_configs: + - url: http://192.168.100.100:8765/sd/prometheus/sd-config?service=nvmeof + - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + """).lstrip() @@ -1282,6 +1297,8 @@ class TestMonitoring: def test_prometheus_config_security_enabled(self, _run_cephadm, _get_uname, cephadm_module: CephadmOrchestrator): _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) _get_uname.return_value = 'test' + pool = 'testpool' + group = 'mygroup' s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast') smb_spec = SMBSpec(cluster_id='foxtrot', config_uri='rados://.smb/foxtrot/config.json',) @@ -1312,6 +1329,9 @@ class TestMonitoring: keepalived_password='12345', virtual_ip="1.2.3.4/32", backend_service='rgw.foo')) as _, \ + with_service(cephadm_module, NvmeofServiceSpec(service_id=f'{pool}.{group}', + group=group, + pool=pool)) as _, \ with_service(cephadm_module, PrometheusSpec('prometheus')) as _: web_config = dedent(""" @@ -1442,6 +1462,27 @@ class TestMonitoring: cert_file: prometheus.crt key_file: prometheus.key + - job_name: 'nvmeof' + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: fsid + scheme: https + tls_config: + ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key + honor_labels: true + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + basic_auth: + username: sd_user + password: sd_password + tls_config: + ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key + - job_name: 'smb' relabel_configs: - source_labels: [__address__] -- 2.39.5