From 36607f7728b56921d381e9a720a451914db4f3e0 Mon Sep 17 00:00:00 2001 From: Aashish Sharma Date: Tue, 12 Dec 2023 14:08:26 +0530 Subject: [PATCH] mgr/dashboard: add prometheus federation config for multi-cluster monitoring Signed-off-by: Aashish Sharma (cherry picked from commit 82b50b4eac819a71542d766b573f65819046f403) --- src/pybind/mgr/cephadm/module.py | 34 +++++++++++++++++ src/pybind/mgr/cephadm/services/monitoring.py | 10 ++++- .../services/prometheus/prometheus.yml.j2 | 37 +++++++++++++++++++ src/pybind/mgr/cephadm/tests/test_services.py | 34 +++++++++++++++++ src/pybind/mgr/orchestrator/_interface.py | 8 ++++ src/pybind/mgr/orchestrator/module.py | 12 ++++++ .../ceph/deployment/service_spec.py | 8 +++- 7 files changed, 140 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index f1d234fb23673..7945f52940b76 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -16,6 +16,8 @@ from threading import Event from cephadm.service_discovery import ServiceDiscovery +from ceph.deployment.service_spec import PrometheusSpec + import string from typing import List, Dict, Optional, Callable, Tuple, TypeVar, \ Any, Set, TYPE_CHECKING, cast, NamedTuple, Sequence, Type, \ @@ -2984,6 +2986,38 @@ Then run the following: self.set_store(PrometheusService.PASS_CFG_KEY, password) return 'prometheus credentials updated correctly' + @handle_orch_error + def set_prometheus_target(self, url: str) -> str: + prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec) + if url not in prometheus_spec.targets: + prometheus_spec.targets.append(url) + else: + return f"Target '{url}' already exists.\n" + if not prometheus_spec: + return "Service prometheus not found\n" + daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('prometheus') + spec = ServiceSpec.from_json(prometheus_spec.to_json()) + self.apply([spec], no_overwrite=False) + for daemon in 
daemons: + self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name) + return 'prometheus multi-cluster targets updated' + + @handle_orch_error + def remove_prometheus_target(self, url: str) -> str: + prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec) + if url in prometheus_spec.targets: + prometheus_spec.targets.remove(url) + else: + return f"Target '{url}' does not exist.\n" + if not prometheus_spec: + return "Service prometheus not found\n" + daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('prometheus') + spec = ServiceSpec.from_json(prometheus_spec.to_json()) + self.apply([spec], no_overwrite=False) + for daemon in daemons: + self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name) + return 'prometheus multi-cluster targets updated' + @handle_orch_error def set_alertmanager_access_info(self, user: str, password: str) -> str: self.set_store(AlertmanagerService.USER_CFG_KEY, user) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 10ddcbbd02f05..0576d4652d039 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -387,12 +387,17 @@ class PrometheusService(CephadmService): assert self.TYPE == daemon_spec.daemon_type spec = cast(PrometheusSpec, self.mgr.spec_store[daemon_spec.service_name].spec) - try: retention_time = spec.retention_time if spec.retention_time else '15d' except AttributeError: retention_time = '15d' + try: + targets = spec.targets + except AttributeError: + logger.warning('Prometheus targets not found in the spec. 
Using empty list.') + targets = [] + try: retention_size = spec.retention_size if spec.retention_size else '0' except AttributeError: @@ -417,6 +422,7 @@ class PrometheusService(CephadmService): alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() + FSID = self.mgr._cluster_fsid # generate the prometheus configuration context = { @@ -431,6 +437,8 @@ class PrometheusService(CephadmService): 'haproxy_sd_url': haproxy_sd_url, 'ceph_exporter_sd_url': ceph_exporter_sd_url, 'nvmeof_sd_url': nvmeof_sd_url, + 'external_prometheus_targets': targets, + 'cluster_fsid': FSID } ip_to_bind_to = '' diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index 57d2f8a3f4b46..931913668ae8c 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -2,6 +2,11 @@ global: scrape_interval: 10s evaluation_interval: 10s +{% if not secure_monitoring_stack %} + external_labels: + cluster: {{ cluster_fsid }} +{% endif %} + rule_files: - /etc/prometheus/alerting/* @@ -45,6 +50,10 @@ scrape_configs: ca_file: root_cert.pem {% else %} honor_labels: true + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: {{ cluster_fsid }} http_sd_configs: - url: {{ mgr_prometheus_sd_url }} {% endif %} @@ -65,6 +74,10 @@ scrape_configs: {% else %} http_sd_configs: - url: {{ node_exporter_sd_url }} + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: {{ cluster_fsid }} {% endif %} {% endif %} @@ -84,6 +97,10 @@ scrape_configs: {% else %} http_sd_configs: - url: {{ haproxy_sd_url }} + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: {{ cluster_fsid }} {% endif %} {% endif %} @@ -103,6 
+120,10 @@ scrape_configs: ca_file: root_cert.pem {% else %} honor_labels: true + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: {{ cluster_fsid }} http_sd_configs: - url: {{ ceph_exporter_sd_url }} {% endif %} @@ -127,3 +148,19 @@ scrape_configs: - url: {{ nvmeof_sd_url }} {% endif %} {% endif %} + +{% if not secure_monitoring_stack %} + - job_name: 'federate' + scrape_interval: 15s + honor_labels: true + metrics_path: '/federate' + params: + 'match[]': + - '{job="ceph"}' + - '{job="node"}' + - '{job="haproxy"}' + - '{job="ceph-exporter"}' + static_configs: + - targets: {{ external_prometheus_targets }} +{% endif %} + diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 1265a39f69055..b6a407e091e69 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -677,6 +677,9 @@ class TestMonitoring: global: scrape_interval: 10s evaluation_interval: 10s + external_labels: + cluster: fsid + rule_files: - /etc/prometheus/alerting/* @@ -689,25 +692,54 @@ class TestMonitoring: scrape_configs: - job_name: 'ceph' honor_labels: true + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: fsid http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus - job_name: 'node' http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: fsid - job_name: 'haproxy' http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: fsid - job_name: 'ceph-exporter' honor_labels: true + relabel_configs: + - source_labels: [__address__] + target_label: cluster + replacement: fsid http_sd_configs: - url: 
http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter - job_name: 'nvmeof' http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + + - job_name: 'federate' + scrape_interval: 15s + honor_labels: true + metrics_path: '/federate' + params: + 'match[]': + - '{job="ceph"}' + - '{job="node"}' + - '{job="haproxy"}' + - '{job="ceph-exporter"}' + static_configs: + - targets: [] """).lstrip() _run_cephadm.assert_called_with( @@ -797,6 +829,7 @@ class TestMonitoring: global: scrape_interval: 10s evaluation_interval: 10s + rule_files: - /etc/prometheus/alerting/* @@ -879,6 +912,7 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + """).lstrip() _run_cephadm.assert_called_with( diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 042572ec19418..04eb70a733620 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -775,6 +775,14 @@ class Orchestrator(object): """set prometheus access information""" raise NotImplementedError() + def set_prometheus_target(self, url: str) -> OrchResult[str]: + """set prometheus target for multi-cluster""" + raise NotImplementedError() + + def remove_prometheus_target(self, url: str) -> OrchResult[str]: + """remove prometheus target for multi-cluster""" + raise NotImplementedError() + def get_alertmanager_access_info(self) -> OrchResult[Dict[str, str]]: """get alertmanager access information""" raise NotImplementedError() diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index e69df3e89e56a..987dbf2f854bf 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1143,6 +1143,18 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, except ArgumentError as e: return HandleCommandResult(-errno.EINVAL, "", (str(e))) + @_cli_write_command('orch prometheus set-target') + def 
_set_prometheus_target(self, url: str) -> HandleCommandResult: + completion = self.set_prometheus_target(url) + result = raise_if_exception(completion) + return HandleCommandResult(stdout=json.dumps(result)) + + @_cli_write_command('orch prometheus remove-target') + def _remove_prometheus_target(self, url: str) -> HandleCommandResult: + completion = self.remove_prometheus_target(url) + result = raise_if_exception(completion) + return HandleCommandResult(stdout=json.dumps(result)) + @_cli_write_command('orch alertmanager set-credentials') def _set_alertmanager_access_info(self, username: Optional[str] = None, password: Optional[str] = None, inbuf: Optional[str] = None) -> HandleCommandResult: try: diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 704dfe6f01b1a..39d00e94fe658 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -806,6 +806,7 @@ class ServiceSpec(object): unmanaged: bool = False, preview_only: bool = False, networks: Optional[List[str]] = None, + targets: Optional[List[str]] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, custom_configs: Optional[List[CustomConfig]] = None, @@ -842,6 +843,7 @@ class ServiceSpec(object): #: :ref:`cephadm-monitoring-networks-ports`, #: :ref:`cephadm-rgw-networks` and :ref:`cephadm-mgr-networks`. 
self.networks: List[str] = networks or [] + self.targets: List[str] = targets or [] self.config: Optional[Dict[str, str]] = None if config: @@ -1588,6 +1590,7 @@ class MonitoringSpec(ServiceSpec): unmanaged: bool = False, preview_only: bool = False, port: Optional[int] = None, + targets: Optional[List[str]] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, custom_configs: Optional[List[CustomConfig]] = None, @@ -1601,7 +1604,7 @@ class MonitoringSpec(ServiceSpec): preview_only=preview_only, config=config, networks=networks, extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args, - custom_configs=custom_configs) + custom_configs=custom_configs, targets=targets) self.service_type = service_type self.port = port @@ -1744,6 +1747,7 @@ class PrometheusSpec(MonitoringSpec): port: Optional[int] = None, retention_time: Optional[str] = None, retention_size: Optional[str] = None, + targets: Optional[List[str]] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, custom_configs: Optional[List[CustomConfig]] = None, @@ -1752,7 +1756,7 @@ class PrometheusSpec(MonitoringSpec): super(PrometheusSpec, self).__init__( 'prometheus', service_id=service_id, placement=placement, unmanaged=unmanaged, - preview_only=preview_only, config=config, networks=networks, port=port, + preview_only=preview_only, config=config, networks=networks, port=port, targets=targets, extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args, custom_configs=custom_configs) -- 2.39.5