]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: add prometheus federation config for multi-cluster 57255/head
authorAashish Sharma <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com>
Tue, 12 Dec 2023 08:38:26 +0000 (14:08 +0530)
committerAashish Sharma <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com>
Fri, 3 May 2024 11:57:43 +0000 (17:27 +0530)
monitoring

Signed-off-by: Aashish Sharma <aasharma@redhat.com>
(cherry picked from commit 82b50b4eac819a71542d766b573f65819046f403)

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
src/pybind/mgr/cephadm/tests/test_services.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py
src/python-common/ceph/deployment/service_spec.py

index f1d234fb236730c4639098d46f69e2bfef3e4c3f..7945f52940b766601caa73d9a302acd13081f405 100644 (file)
@@ -16,6 +16,8 @@ from threading import Event
 
 from cephadm.service_discovery import ServiceDiscovery
 
+from ceph.deployment.service_spec import PrometheusSpec
+
 import string
 from typing import List, Dict, Optional, Callable, Tuple, TypeVar, \
     Any, Set, TYPE_CHECKING, cast, NamedTuple, Sequence, Type, \
@@ -2984,6 +2986,38 @@ Then run the following:
         self.set_store(PrometheusService.PASS_CFG_KEY, password)
         return 'prometheus credentials updated correctly'
 
+    @handle_orch_error
+    def set_prometheus_target(self, url: str) -> str:
+        prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
+        if url not in prometheus_spec.targets:
+            prometheus_spec.targets.append(url)
+        else:
+            return f"Target '{url}' already exists.\n"
+        if not prometheus_spec:
+            return "Service prometheus not found\n"
+        daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('prometheus')
+        spec = ServiceSpec.from_json(prometheus_spec.to_json())
+        self.apply([spec], no_overwrite=False)
+        for daemon in daemons:
+            self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name)
+        return 'prometheus multi-cluster targets updated'
+
+    @handle_orch_error
+    def remove_prometheus_target(self, url: str) -> str:
+        prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
+        if url in prometheus_spec.targets:
+            prometheus_spec.targets.remove(url)
+        else:
+            return f"Target '{url}' does not exist.\n"
+        if not prometheus_spec:
+            return "Service prometheus not found\n"
+        daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('prometheus')
+        spec = ServiceSpec.from_json(prometheus_spec.to_json())
+        self.apply([spec], no_overwrite=False)
+        for daemon in daemons:
+            self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name)
+        return 'prometheus multi-cluster targets updated'
+
     @handle_orch_error
     def set_alertmanager_access_info(self, user: str, password: str) -> str:
         self.set_store(AlertmanagerService.USER_CFG_KEY, user)
index 10ddcbbd02f056935fffd816c1b5d0393f906e25..0576d4652d0399cb54e5c25d6f56b8163fe74dc2 100644 (file)
@@ -387,12 +387,17 @@ class PrometheusService(CephadmService):
 
         assert self.TYPE == daemon_spec.daemon_type
         spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
-
         try:
             retention_time = spec.retention_time if spec.retention_time else '15d'
         except AttributeError:
             retention_time = '15d'
 
+        try:
+            targets = spec.targets
+        except AttributeError:
+            logger.warning('Prometheus targets not found in the spec. Using empty list.')
+            targets = []
+
         try:
             retention_size = spec.retention_size if spec.retention_size else '0'
         except AttributeError:
@@ -417,6 +422,7 @@ class PrometheusService(CephadmService):
 
         alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
         prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
+        FSID = self.mgr._cluster_fsid
 
         # generate the prometheus configuration
         context = {
@@ -431,6 +437,8 @@ class PrometheusService(CephadmService):
             'haproxy_sd_url': haproxy_sd_url,
             'ceph_exporter_sd_url': ceph_exporter_sd_url,
             'nvmeof_sd_url': nvmeof_sd_url,
+            'external_prometheus_targets': targets,
+            'cluster_fsid': FSID
         }
 
         ip_to_bind_to = ''
index 57d2f8a3f4b46ceb3eaa8e638d3a26551e94e3af..931913668ae8cecdca2804b183d86147378ef702 100644 (file)
@@ -2,6 +2,11 @@
 global:
   scrape_interval: 10s
   evaluation_interval: 10s
+{% if not secure_monitoring_stack %}
+  external_labels:
+    cluster: {{ cluster_fsid }}
+{% endif %}
+
 rule_files:
   - /etc/prometheus/alerting/*
 
@@ -45,6 +50,10 @@ scrape_configs:
         ca_file: root_cert.pem
 {% else %}
     honor_labels: true
+    relabel_configs:
+    - source_labels: [__address__]
+      target_label: cluster
+      replacement: {{ cluster_fsid }}
     http_sd_configs:
     - url: {{ mgr_prometheus_sd_url }}
 {% endif %}
@@ -65,6 +74,10 @@ scrape_configs:
 {% else %}
     http_sd_configs:
     - url: {{ node_exporter_sd_url }}
+    relabel_configs:
+    - source_labels: [__address__]
+      target_label: cluster
+      replacement: {{ cluster_fsid }}
 {% endif %}
 {% endif %}
 
@@ -84,6 +97,10 @@ scrape_configs:
 {% else %}
     http_sd_configs:
     - url: {{ haproxy_sd_url }}
+    relabel_configs:
+    - source_labels: [__address__]
+      target_label: cluster
+      replacement: {{ cluster_fsid }}
 {% endif %}
 {% endif %}
 
@@ -103,6 +120,10 @@ scrape_configs:
         ca_file: root_cert.pem
 {% else %}
     honor_labels: true
+    relabel_configs:
+    - source_labels: [__address__]
+      target_label: cluster
+      replacement: {{ cluster_fsid }}
     http_sd_configs:
     - url: {{ ceph_exporter_sd_url }}
 {% endif %}
@@ -127,3 +148,19 @@ scrape_configs:
     - url: {{ nvmeof_sd_url }}
 {% endif %}
 {% endif %}
+
+{% if not secure_monitoring_stack %}
+  - job_name: 'federate'
+    scrape_interval: 15s
+    honor_labels: true
+    metrics_path: '/federate'
+    params:
+      'match[]':
+        - '{job="ceph"}'
+        - '{job="node"}'
+        - '{job="haproxy"}'
+        - '{job="ceph-exporter"}'
+    static_configs:
+    - targets: {{ external_prometheus_targets }}
+{% endif %}
+
index 1265a39f69055d4f998aee30a3251ab27f4e9720..b6a407e091e69adb271ee99104924ea38103cf30 100644 (file)
@@ -677,6 +677,9 @@ class TestMonitoring:
                 global:
                   scrape_interval: 10s
                   evaluation_interval: 10s
+                  external_labels:
+                    cluster: fsid
+
                 rule_files:
                   - /etc/prometheus/alerting/*
 
@@ -689,25 +692,54 @@ class TestMonitoring:
                 scrape_configs:
                   - job_name: 'ceph'
                     honor_labels: true
+                    relabel_configs:
+                    - source_labels: [__address__]
+                      target_label: cluster
+                      replacement: fsid
                     http_sd_configs:
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus
 
                   - job_name: 'node'
                     http_sd_configs:
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
+                    relabel_configs:
+                    - source_labels: [__address__]
+                      target_label: cluster
+                      replacement: fsid
 
                   - job_name: 'haproxy'
                     http_sd_configs:
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy
+                    relabel_configs:
+                    - source_labels: [__address__]
+                      target_label: cluster
+                      replacement: fsid
 
                   - job_name: 'ceph-exporter'
                     honor_labels: true
+                    relabel_configs:
+                    - source_labels: [__address__]
+                      target_label: cluster
+                      replacement: fsid
                     http_sd_configs:
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter
 
                   - job_name: 'nvmeof'
                     http_sd_configs:
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
+
+                  - job_name: 'federate'
+                    scrape_interval: 15s
+                    honor_labels: true
+                    metrics_path: '/federate'
+                    params:
+                      'match[]':
+                        - '{job="ceph"}'
+                        - '{job="node"}'
+                        - '{job="haproxy"}'
+                        - '{job="ceph-exporter"}'
+                    static_configs:
+                    - targets: []
                 """).lstrip()
 
                 _run_cephadm.assert_called_with(
@@ -797,6 +829,7 @@ class TestMonitoring:
                 global:
                   scrape_interval: 10s
                   evaluation_interval: 10s
+
                 rule_files:
                   - /etc/prometheus/alerting/*
 
@@ -879,6 +912,7 @@ class TestMonitoring:
                         password: sd_password
                       tls_config:
                         ca_file: root_cert.pem
+
                 """).lstrip()
 
                 _run_cephadm.assert_called_with(
index 042572ec19418065c3a84aa3b9bc0ea4a5f4ea63..04eb70a733620fc70d6bb7b2497511e25b949d3c 100644 (file)
@@ -775,6 +775,14 @@ class Orchestrator(object):
         """set prometheus access information"""
         raise NotImplementedError()
 
+    def set_prometheus_target(self, url: str) -> OrchResult[str]:
+        """set prometheus target for multi-cluster"""
+        raise NotImplementedError()
+
+    def remove_prometheus_target(self, url: str) -> OrchResult[str]:
+        """remove prometheus target for multi-cluster"""
+        raise NotImplementedError()
+
     def get_alertmanager_access_info(self) -> OrchResult[Dict[str, str]]:
         """get alertmanager access information"""
         raise NotImplementedError()
index e69df3e89e56af5dbfc2bd2de4804e2cf39d9cb5..987dbf2f854bfe1b5996fcf0858e3a0fad489d79 100644 (file)
@@ -1143,6 +1143,18 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
         except ArgumentError as e:
             return HandleCommandResult(-errno.EINVAL, "", (str(e)))
 
+    @_cli_write_command('orch prometheus set-target')
+    def _set_prometheus_target(self, url: str) -> HandleCommandResult:
+        completion = self.set_prometheus_target(url)
+        result = raise_if_exception(completion)
+        return HandleCommandResult(stdout=json.dumps(result))
+
+    @_cli_write_command('orch prometheus remove-target')
+    def _remove_prometheus_target(self, url: str) -> HandleCommandResult:
+        completion = self.remove_prometheus_target(url)
+        result = raise_if_exception(completion)
+        return HandleCommandResult(stdout=json.dumps(result))
+
     @_cli_write_command('orch alertmanager set-credentials')
     def _set_alertmanager_access_info(self, username: Optional[str] = None, password: Optional[str] = None, inbuf: Optional[str] = None) -> HandleCommandResult:
         try:
index 704dfe6f01b1a7cc54ec7010306b9fb89bf179da..39d00e94fe658ecf5827910db36adeb55d4c0719 100644 (file)
@@ -806,6 +806,7 @@ class ServiceSpec(object):
                  unmanaged: bool = False,
                  preview_only: bool = False,
                  networks: Optional[List[str]] = None,
+                 targets: Optional[List[str]] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
@@ -842,6 +843,7 @@ class ServiceSpec(object):
         #: :ref:`cephadm-monitoring-networks-ports`,
         #: :ref:`cephadm-rgw-networks` and :ref:`cephadm-mgr-networks`.
         self.networks: List[str] = networks or []
+        self.targets: List[str] = targets or []
 
         self.config: Optional[Dict[str, str]] = None
         if config:
@@ -1588,6 +1590,7 @@ class MonitoringSpec(ServiceSpec):
                  unmanaged: bool = False,
                  preview_only: bool = False,
                  port: Optional[int] = None,
+                 targets: Optional[List[str]] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
@@ -1601,7 +1604,7 @@ class MonitoringSpec(ServiceSpec):
             preview_only=preview_only, config=config,
             networks=networks, extra_container_args=extra_container_args,
             extra_entrypoint_args=extra_entrypoint_args,
-            custom_configs=custom_configs)
+            custom_configs=custom_configs, targets=targets)
 
         self.service_type = service_type
         self.port = port
@@ -1744,6 +1747,7 @@ class PrometheusSpec(MonitoringSpec):
                  port: Optional[int] = None,
                  retention_time: Optional[str] = None,
                  retention_size: Optional[str] = None,
+                 targets: Optional[List[str]] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
@@ -1752,7 +1756,7 @@ class PrometheusSpec(MonitoringSpec):
         super(PrometheusSpec, self).__init__(
             'prometheus', service_id=service_id,
             placement=placement, unmanaged=unmanaged,
-            preview_only=preview_only, config=config, networks=networks, port=port,
+            preview_only=preview_only, config=config, networks=networks, port=port, targets=targets,
             extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args,
             custom_configs=custom_configs)