]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: fixing nvmeof scraping Prometheus config generation 65150/head
authorRedouane Kachach <rkachach@ibm.com>
Wed, 20 Aug 2025 12:01:28 +0000 (14:01 +0200)
committerRedouane Kachach <rkachach@ibm.com>
Sat, 6 Sep 2025 21:40:36 +0000 (23:40 +0200)
Signed-off-by: Redouane Kachach <rkachach@ibm.com>
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/services/nvmeof.py
src/pybind/mgr/cephadm/services/service_registry.py
src/pybind/mgr/cephadm/tests/test_services.py

index ad6d30fdc636be2cb7a05fcda23979c201c38232..997d522ed9c1adf315168590c8aa0b4aeac7f1c4 100644 (file)
@@ -522,10 +522,16 @@ class PrometheusService(CephadmService):
         """
         Retrieves the service discovery URLs for the services that require monitoring
 
+        Note: we always add the 'ceph' Prometheus target as it corresponds to the prometheus-mgr module target
+
         Returns:
            Dict[str, List[str]]: A dictionary where the keys represent service categories (e.g., "nfs", "node-exporter") and
                                   the values are a list of service-discovery URLs used to get the corresponding service targets.
         """
+
+        def sd_urls(svc: str, prefixes: List[str]) -> list[str]:
+            return [f'{p}/sd/prometheus/sd-config?service={svc}' for p in prefixes]
+
         if mgmt_gw_enabled:
             service_discovery_url_prefixes = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}']
         else:
@@ -533,13 +539,13 @@ class PrometheusService(CephadmService):
             protocol = 'https' if security_enabled else 'http'
             service_discovery_url_prefixes = [f'{protocol}://{wrap_ipv6(ip)}:{port}'
                                               for ip in self.mgr._get_mgr_ips()]
-        return {
-            service: [f'{prefix}/sd/prometheus/sd-config?service={service}' for prefix in service_discovery_url_prefixes]
-            for service in service_registry.get_services_requiring_monitoring()
-            if service == 'ceph'
-            or bool(self.mgr.cache.get_daemons_by_service(service))
-            or bool(self.mgr.cache.get_daemons_by_type(service))
-        }
+
+        services_to_monitor = ['ceph', *(
+            s for s in service_registry.get_services_requiring_monitoring()
+            if self.mgr.cache.get_daemons_by_service(s) or self.mgr.cache.get_daemons_by_type(s)
+        )]
+
+        return {s: sd_urls(s, service_discovery_url_prefixes) for s in services_to_monitor}
 
     def configure_alerts(self, r: Dict) -> None:
         # include alerts, if present in the container
@@ -666,14 +672,12 @@ class PrometheusService(CephadmService):
                 deps.append(f'alert-cred:{utils.md5_hash(alertmanager_user + alertmanager_password)}')
 
         # Adding other services as deps (with corresponding justification):
-        # ceph-exporter: scraping target
-        # node-exporter: scraping target
-        # ingress      : scraping target
-        # alert-manager: part of prometheus configuration
-        # mgmt-gateway : since url_prefix depends on the existence of mgmt-gateway
+        # mgmt-gateway : url_prefix depends on the existence of mgmt-gateway
         # oauth2-proxy : enabling basic-auth (or not) depends on the existence of 'oauth2-proxy'
-        for svc in ['mgmt-gateway', 'oauth2-proxy', 'alertmanager', 'node-exporter', 'ceph-exporter', 'ingress']:
-            deps.append(f'{svc}_configured:{bool(mgr.cache.get_daemons_by_service(svc))}')
+        prometheus_svc_deps = service_registry.get_services_requiring_monitoring() + ['mgmt-gateway', 'oauth2-proxy']
+        for svc in prometheus_svc_deps:
+            configured = bool(mgr.cache.get_daemons_by_service(svc)) or bool(mgr.cache.get_daemons_by_type(svc))
+            deps.append(f'{svc}_configured:{configured}')
 
         if not mgmt_gw_enabled:
             # Ceph mgrs are dependency because when mgmt-gateway is not enabled the service-discovery depends on mgrs ips
index f08ba9462a0704650e06c25c4fa68b49f53f69bd..405bc349977c0782fdaed1e5d3581bbc0e31b0ef 100644 (file)
@@ -158,9 +158,14 @@ class NvmeofService(CephService):
 
     def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
         def get_set_cmd_dicts(out: str) -> List[dict]:
-            gateways = json.loads(out)['gateways']
-            cmd_dicts = []
 
+            try:
+                gateways = json.loads(out).get('gateways', [])
+            except json.decoder.JSONDecodeError as e:
+                logger.error(f'Error while trying to parse gateways JSON: {e}')
+                return []
+
+            cmd_dicts = []
             for dd in daemon_descrs:
                 spec = cast(NvmeofServiceSpec,
                             self.mgr.spec_store.all_specs.get(dd.service_name(), None))
index 10fb893a3e2b549aab6e1b1f733223037be75bfc..20879019fc96af35ae933d60606b919c1c7bf6db 100644 (file)
@@ -69,7 +69,6 @@ class CephadmServiceRegistry:
     def get_services_requiring_monitoring(self) -> List[str]:
         """Return a list with service types that requiere monitoring."""
         services_to_monitor = [svc for svc in self._services if self._services[svc].needs_monitoring]
-        services_to_monitor.append('ceph')  # this is needed for mgr-prometheus targets
         return sorted(services_to_monitor)
 
 
index ffa6537dcf4b84d485e1c37e8a9154c50471fbc7..72a10edddb3c54bd6314ff8f0670909db92ec9be 100644 (file)
@@ -1145,6 +1145,8 @@ class TestMonitoring:
     @patch("cephadm.module.CephadmOrchestrator._get_mgr_ips", lambda _: ['192.168.100.100', '::1'])
     def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
         _run_cephadm.side_effect = async_side_effect(('{}', '', 0))
+        pool = 'testpool'
+        group = 'mygroup'
         s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast')
         with with_host(cephadm_module, 'test'):
             # host "test" needs to have networks for keepalive to be placed
@@ -1165,6 +1167,9 @@ class TestMonitoring:
                                                              keepalived_password='12345',
                                                              virtual_ip="1.2.3.4/32",
                                                              backend_service='rgw.foo')) as _, \
+                    with_service(cephadm_module, NvmeofServiceSpec(service_id=f'{pool}.{group}',
+                                                                   group=group,
+                                                                   pool=pool)) as _, \
                     with_service(cephadm_module, PrometheusSpec('prometheus',
                                                                 networks=['1.2.3.0/24'],
                                                                 only_bind_port_on_networks=True)) as _:
@@ -1231,6 +1236,16 @@ class TestMonitoring:
                     - url: http://192.168.100.100:8765/sd/prometheus/sd-config?service=node-exporter
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
 
+                  - job_name: 'nvmeof'
+                    relabel_configs:
+                    - source_labels: [__address__]
+                      target_label: cluster
+                      replacement: fsid
+                    honor_labels: true
+                    http_sd_configs:
+                    - url: http://192.168.100.100:8765/sd/prometheus/sd-config?service=nvmeof
+                    - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
+
 
                 """).lstrip()
 
@@ -1282,6 +1297,8 @@ class TestMonitoring:
     def test_prometheus_config_security_enabled(self, _run_cephadm, _get_uname, cephadm_module: CephadmOrchestrator):
         _run_cephadm.side_effect = async_side_effect(('{}', '', 0))
         _get_uname.return_value = 'test'
+        pool = 'testpool'
+        group = 'mygroup'
         s = RGWSpec(service_id="foo", placement=PlacementSpec(count=1), rgw_frontend_type='beast')
         smb_spec = SMBSpec(cluster_id='foxtrot', config_uri='rados://.smb/foxtrot/config.json',)
 
@@ -1312,6 +1329,9 @@ class TestMonitoring:
                                                              keepalived_password='12345',
                                                              virtual_ip="1.2.3.4/32",
                                                              backend_service='rgw.foo')) as _, \
+                    with_service(cephadm_module, NvmeofServiceSpec(service_id=f'{pool}.{group}',
+                                                                   group=group,
+                                                                   pool=pool)) as _, \
                     with_service(cephadm_module, PrometheusSpec('prometheus')) as _:
 
                 web_config = dedent("""
@@ -1442,6 +1462,27 @@ class TestMonitoring:
                         cert_file: prometheus.crt
                         key_file: prometheus.key
 
+                  - job_name: 'nvmeof'
+                    relabel_configs:
+                    - source_labels: [__address__]
+                      target_label: cluster
+                      replacement: fsid
+                    scheme: https
+                    tls_config:
+                      ca_file: root_cert.pem
+                      cert_file: prometheus.crt
+                      key_file: prometheus.key
+                    honor_labels: true
+                    http_sd_configs:
+                    - url: https://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
+                      basic_auth:
+                        username: sd_user
+                        password: sd_password
+                      tls_config:
+                        ca_file: root_cert.pem
+                        cert_file: prometheus.crt
+                        key_file: prometheus.key
+
                   - job_name: 'smb'
                     relabel_configs:
                     - source_labels: [__address__]