From: Aashish Sharma
Date: Wed, 29 Apr 2026 04:34:23 +0000 (+0530)
Subject: mgr/dashboard: add remote write section to prometheus configuration
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7b2b62dcfcdb0fc797c94e4948c24cc60e9ebdb9;p=ceph.git

mgr/dashboard: add remote write section to prometheus configuration

Add cli commands to add/remove remote_write section to prometheus configuration template

Fixes: https://tracker.ceph.com/issues/76316
Signed-off-by: Aashish Sharma
---

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index e6704e8a023..48afed682fe 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -3427,6 +3427,72 @@ Then run the following:
             self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name)
         return 'prometheus multi-cluster targets updated'
 
+    @handle_orch_error
+    def set_prometheus_remote_write(self, url: str, remote_write_allowed_metrics: List[str]) -> str:
+        """Store a remote_write URL and metric allow-list in the prometheus spec.
+
+        Validation failures are reported through the returned message
+        rather than raised; on success the updated spec is re-applied.
+        """
+        if not url or not url.strip():
+            return 'Invalid URL. URL cannot be empty.'
+
+        try:
+            parsed_url = urlparse(url)
+            host = parsed_url.hostname
+
+            if parsed_url.scheme not in ('http', 'https'):
+                return 'Invalid URL. Scheme must be http or https.'
+            if not host:
+                return 'Invalid URL. Hostname is missing.'
+        except ValueError as e:
+            return f'Invalid url. {str(e)}'
+
+        # The config template only renders remote_write when the allow-list
+        # is non-empty, so an empty list would be stored but silently ignored.
+        metrics = [m.strip() for m in (remote_write_allowed_metrics or []) if m and m.strip()]
+        if not metrics:
+            return 'Invalid metrics. At least one allowed metric is required.'
+
+        prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
+        if not prometheus_spec:
+            return "Service prometheus not found\n"
+
+        if url == prometheus_spec.remote_write_url:
+            return f"Remote write URL '{url}' already exists.\n"
+
+        prometheus_spec.remote_write_url = url
+        prometheus_spec.remote_write_allowed_metrics = '|'.join(metrics)
+
+        spec = ServiceSpec.from_json(prometheus_spec.to_json())
+        self.apply([spec], no_overwrite=False)
+
+        return 'prometheus remote write updated'
+
+    @handle_orch_error
+    def remove_prometheus_remote_write(self, url: str) -> str:
+        """Clear the remote_write settings if ``url`` is the configured one.
+
+        Outcomes, including failures, are reported through the returned
+        message; on success the updated spec is re-applied.
+        """
+        if not url or not url.strip():
+            return 'Invalid URL. URL cannot be empty.'
+
+        prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
+        # Guard against a missing spec before reading any of its attributes.
+        if not prometheus_spec:
+            return "Service prometheus not found\n"
+        if url != prometheus_spec.remote_write_url:
+            return f"Remote write URL '{url}' does not exist.\n"
+
+        prometheus_spec.remote_write_url = ''
+        prometheus_spec.remote_write_allowed_metrics = ''
+
+        spec = ServiceSpec.from_json(prometheus_spec.to_json())
+        self.apply([spec], no_overwrite=False)
+        return 'prometheus remote write removed'
+
     @handle_orch_error
     def set_alertmanager_access_info(self, user: str, password: str) -> str:
         self.set_store(AlertmanagerService.USER_CFG_KEY, user)
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index afbf9cd0ca0..c2132325ee6 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -582,6 +582,8 @@ class PrometheusService(CephadmService):
         retention_time = get_field_from_spec(spec, 'retention_time', '15d')
         retention_size = get_field_from_spec(spec, 'retention_size', '0')
         targets = get_field_from_spec(spec, 'targets', [])
+        remote_write_url = get_field_from_spec(spec, 'remote_write_url', '')
+        remote_write_allowed_metrics = get_field_from_spec(spec, 'remote_write_allowed_metrics',
'') # build service discovery end-point security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config() @@ -607,6 +609,8 @@ class PrometheusService(CephadmService): 'service_discovery_password': self.mgr.http_server.service_discovery.password, 'service_discovery_cfg': self.get_service_discovery_cfg(security_enabled, mgmt_gw_enabled), 'external_prometheus_targets': targets, + 'remote_write_url': remote_write_url, + 'remote_write_allowed_metrics': remote_write_allowed_metrics, 'cluster_fsid': self.mgr._cluster_fsid, 'clusters_credentials': cluster_credentials, 'federate_path': federate_path @@ -683,6 +687,12 @@ class PrometheusService(CephadmService): # Ceph mgrs are dependency because when mgmt-gateway is not enabled the service-discovery depends on mgrs ips deps += mgr.cache.get_daemons_by_types(['mgr']) + if spec: + prometheus_spec = cast(PrometheusSpec, spec) + + deps.append(f'remote_write_url:{prometheus_spec.remote_write_url}') + deps.append(f'remote_write_metrics:{prometheus_spec.remote_write_allowed_metrics}') + return sorted(deps) def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index 2afbf606af2..17c4ef85ae2 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -105,3 +105,17 @@ scrape_configs: static_configs: - targets: ['{{ url }}'] {% endfor %} + +{% if remote_write_url != '' and remote_write_allowed_metrics != '' %} +remote_write: + - url: {{ remote_write_url }}/api/v1/write + tls_config: + insecure_skip_verify: true + write_relabel_configs: + - source_labels: [__name__] + regex: '^({{ remote_write_allowed_metrics }})$' + action: keep + - source_labels: [__name__] + regex: 'ALERTS|ALERTS_FOR_STATE' + action: drop +{% endif %} diff 
--git a/src/pybind/mgr/cephadm/tests/services/test_monitoring.py b/src/pybind/mgr/cephadm/tests/services/test_monitoring.py index c73f7fad620..9beadc5d253 100644 --- a/src/pybind/mgr/cephadm/tests/services/test_monitoring.py +++ b/src/pybind/mgr/cephadm/tests/services/test_monitoring.py @@ -745,6 +745,7 @@ class TestMonitoring: - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + """).lstrip() _run_cephadm.assert_called_with( @@ -782,8 +783,8 @@ class TestMonitoring: "use_url_prefix": False }, }), - error_ok=True, use_current_daemon_image=False, + error_ok=True, ) @patch("cephadm.module.CephadmOrchestrator.get_unique_name") @@ -1005,6 +1006,7 @@ class TestMonitoring: key_file: prometheus.key + """).lstrip() _run_cephadm.assert_called_with( @@ -1046,8 +1048,8 @@ class TestMonitoring: 'web_config': '/etc/prometheus/web.yml' }, }), - error_ok=True, use_current_daemon_image=False, + error_ok=True, ) @patch("cephadm.serve.CephadmServe._run_cephadm") diff --git a/src/pybind/mgr/dashboard/controllers/prometheus.py b/src/pybind/mgr/dashboard/controllers/prometheus.py index f44de668786..1fbf8186bda 100644 --- a/src/pybind/mgr/dashboard/controllers/prometheus.py +++ b/src/pybind/mgr/dashboard/controllers/prometheus.py @@ -4,7 +4,7 @@ import os import tempfile import time from datetime import datetime -from typing import NamedTuple, Optional +from typing import List, NamedTuple, Optional import requests @@ -261,6 +261,17 @@ class Prometheus(PrometheusRESTController): return self.alert_proxy('GET', f'/alerts/groups?filter=cluster={fsid}', params) return self.alert_proxy('GET', '/alerts/groups', params) + @RESTController.Collection(method='PUT', path='/set_remote_write') + def set_remote_write(self, remote_write_url: str, remote_write_allowed_metrics: List[str]): + orch_client = OrchClient.instance() + return orch_client.monitoring.set_prometheus_remote_write(remote_write_url, + remote_write_allowed_metrics) + + @RESTController.Collection(method='PUT', 
path='/remove_remote_write') + def remove_remote_write(self, url: str): + orch_client = OrchClient.instance() + return orch_client.monitoring.remove_prometheus_remote_write(url) + @RESTController.Collection(method='GET', path='/prometheus_query_data') def get_prometeus_query_data(self, **params): params['query'] = params.pop('params') diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index a43c312b4e6..24d98a0f0f9 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -17383,6 +17383,51 @@ paths: - jwt: [] tags: - Prometheus + /api/prometheus/remove_remote_write: + put: + parameters: [] + requestBody: + content: + application/json: + schema: + properties: + url: + type: string + required: + - url + type: object + responses: + '200': + content: + application/json: + schema: + type: object + application/vnd.ceph.api.v1.0+json: + schema: + type: object + description: Resource updated. + '202': + content: + application/json: + schema: + type: object + application/vnd.ceph.api.v1.0+json: + schema: + type: object + description: Operation is still executing. Please check the task queue. + '400': + description: Operation exception. Please check the response body for details. + '401': + description: Unauthenticated access. Please login first. + '403': + description: Unauthorized access. Please check your permissions. + '500': + description: Unexpected error. Please check the response body for the stack + trace. 
+ security: + - jwt: [] + tags: + - Prometheus /api/prometheus/rules: get: parameters: [] @@ -17409,6 +17454,54 @@ paths: - jwt: [] tags: - Prometheus + /api/prometheus/set_remote_write: + put: + parameters: [] + requestBody: + content: + application/json: + schema: + properties: + remote_write_allowed_metrics: + type: string + remote_write_url: + type: string + required: + - remote_write_url + - remote_write_allowed_metrics + type: object + responses: + '200': + content: + application/json: + schema: + type: object + application/vnd.ceph.api.v1.0+json: + schema: + type: object + description: Resource updated. + '202': + content: + application/json: + schema: + type: object + application/vnd.ceph.api.v1.0+json: + schema: + type: object + description: Operation is still executing. Please check the task queue. + '400': + description: Operation exception. Please check the response body for details. + '401': + description: Unauthenticated access. Please login first. + '403': + description: Unauthorized access. Please check your permissions. + '500': + description: Unexpected error. Please check the response body for the stack + trace. 
+ security: + - jwt: [] + tags: + - Prometheus /api/prometheus/silence: post: parameters: [] diff --git a/src/pybind/mgr/dashboard/services/orchestrator.py b/src/pybind/mgr/dashboard/services/orchestrator.py index e2495a44444..f6940edc1d4 100644 --- a/src/pybind/mgr/dashboard/services/orchestrator.py +++ b/src/pybind/mgr/dashboard/services/orchestrator.py @@ -245,6 +245,18 @@ class MonitoringManager(ResourceManager): """Get security config information""" return self.api.get_security_config() + @wait_api_result + def set_prometheus_remote_write(self, remote_write_url: str, + remote_write_allowed_metrics: List[str]) -> str: + """Set Prometheus remote write configuration""" + return self.api.set_prometheus_remote_write(remote_write_url, + remote_write_allowed_metrics) + + @wait_api_result + def remove_prometheus_remote_write(self, remote_write_url: str) -> str: + """Remove Prometheus remote write configuration""" + return self.api.remove_prometheus_remote_write(remote_write_url) + class OrchClient(object): diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 136fde595ac..a25b09ea6f3 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -862,6 +862,14 @@ class Orchestrator(object): """remove prometheus target for multi-cluster""" raise NotImplementedError() + def set_prometheus_remote_write(self, url: str, remote_write_allowed_metrics: List[str]) -> OrchResult[str]: + """set prometheus remote write url and allowed metrics for multi-cluster""" + raise NotImplementedError() + + def remove_prometheus_remote_write(self, url: str) -> OrchResult[str]: + """remove prometheus remote write url and allowed metrics for multi-cluster""" + raise NotImplementedError() + def get_alertmanager_access_info(self) -> OrchResult[Dict[str, str]]: """get alertmanager access information""" raise NotImplementedError() diff --git a/src/pybind/mgr/orchestrator/module.py 
b/src/pybind/mgr/orchestrator/module.py index 83db20187f7..f47775ae05d 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1432,6 +1432,18 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule): result = raise_if_exception(completion) return HandleCommandResult(stdout=json.dumps(result)) + @OrchestratorCLICommand.Write('orch prometheus set-remote-write') + def _set_prometheus_remote_write(self, url: str, remote_write_allowed_metrics: List[str]) -> HandleCommandResult: + completion = self.set_prometheus_remote_write(url, remote_write_allowed_metrics) + result = raise_if_exception(completion) + return HandleCommandResult(stdout=json.dumps(result)) + + @OrchestratorCLICommand.Write('orch prometheus remove-remote-write') + def _remove_prometheus_remote_write(self, url: str) -> HandleCommandResult: + completion = self.remove_prometheus_remote_write(url) + result = raise_if_exception(completion) + return HandleCommandResult(stdout=json.dumps(result)) + @OrchestratorCLICommand.Write('orch alertmanager set-credentials') def _set_alertmanager_access_info(self, username: Optional[str] = None, password: Optional[str] = None, inbuf: Optional[str] = None) -> HandleCommandResult: try: diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 246a9b898c6..354d682a1e3 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -973,6 +973,8 @@ class ServiceSpec(object): preview_only: bool = False, networks: Optional[List[str]] = None, targets: Optional[List[str]] = None, + remote_write_url: Optional[str] = None, + remote_write_allowed_metrics: Optional[str] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, custom_configs: Optional[List[CustomConfig]] = None, @@ -1021,6 +1023,8 @@ class ServiceSpec(object): #: :ref:`cephadm-rgw-networks` 
and :ref:`cephadm-mgr-networks`. self.networks: List[str] = networks or [] self.targets: List[str] = targets or [] + self.remote_write_url = remote_write_url + self.remote_write_allowed_metrics = remote_write_allowed_metrics self.config: Optional[Dict[str, str]] = None if config: @@ -2894,6 +2898,8 @@ class MonitoringSpec(ServiceSpec): preview_only: bool = False, port: Optional[int] = None, targets: Optional[List[str]] = None, + remote_write_url: Optional[str] = None, + remote_write_allowed_metrics: Optional[str] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, custom_configs: Optional[List[CustomConfig]] = None, @@ -2908,7 +2914,9 @@ class MonitoringSpec(ServiceSpec): preview_only=preview_only, config=config, networks=networks, extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args, - custom_configs=custom_configs, targets=targets) + custom_configs=custom_configs, targets=targets, + remote_write_url=remote_write_url, + remote_write_allowed_metrics=remote_write_allowed_metrics) self.service_type = service_type self.port = port @@ -3081,6 +3089,8 @@ class PrometheusSpec(MonitoringSpec): retention_time: Optional[str] = None, retention_size: Optional[str] = None, targets: Optional[List[str]] = None, + remote_write_url: Optional[str] = None, + remote_write_allowed_metrics: Optional[str] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, custom_configs: Optional[List[CustomConfig]] = None, @@ -3092,7 +3102,8 @@ class PrometheusSpec(MonitoringSpec): ssl=ssl, certificate_source=certificate_source, preview_only=preview_only, config=config, networks=networks, port=port, targets=targets, extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args, - custom_configs=custom_configs) + custom_configs=custom_configs, remote_write_url=remote_write_url, + 
remote_write_allowed_metrics=remote_write_allowed_metrics) self.retention_time = retention_time.strip() if retention_time else None self.retention_size = retention_size.strip() if retention_size else None