]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/dashboard: add remote write section to prometheus configuration
authorAashish Sharma <aashish@li-e9bf2ecc-2ad7-11b2-a85c-baf05c5182ab.ibm.com>
Wed, 29 Apr 2026 04:34:23 +0000 (10:04 +0530)
committerAashish Sharma <aashish@li-e9bf2ecc-2ad7-11b2-a85c-baf05c5182ab.ibm.com>
Thu, 30 Apr 2026 13:17:10 +0000 (18:47 +0530)
Add CLI commands to add/remove the remote_write section in the Prometheus configuration template.

Fixes: https://tracker.ceph.com/issues/76316
Signed-off-by: Aashish Sharma <aasharma@redhat.com>
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
src/pybind/mgr/cephadm/tests/services/test_monitoring.py
src/pybind/mgr/dashboard/controllers/prometheus.py
src/pybind/mgr/dashboard/openapi.yaml
src/pybind/mgr/dashboard/services/orchestrator.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py
src/python-common/ceph/deployment/service_spec.py

index e6704e8a023c2d4dd194164cf0f335c9f26c31bc..48afed682fe2937b7e8a9d69688a30059da20663 100644 (file)
@@ -3427,6 +3427,55 @@ Then run the following:
             self.daemon_action(action='redeploy', daemon_name=daemon.daemon_name)
         return 'prometheus multi-cluster targets updated'
 
+    @handle_orch_error
+    def set_prometheus_remote_write(self, url: str, remote_write_allowed_metrics: List[str]) -> str:
+        """Store the Prometheus remote write URL and metric allow-list on the spec.
+
+        The values are written to the prometheus service spec, which is then
+        re-applied through ``self.apply``.
+
+        :param url: http(s) URL of the remote write receiver.
+        :param remote_write_allowed_metrics: metric names/patterns; they are
+            joined with ``|`` and stored as a single string on the spec.
+        :return: a status message, or a validation message if the input is
+            rejected (nothing is changed in that case).
+        """
+        if not url or not url.strip():
+            return 'Invalid URL. URL cannot be empty.'
+        # Persist the URL without surrounding whitespace: it is rendered
+        # verbatim into the prometheus configuration template.
+        url = url.strip()
+
+        try:
+            parsed_url = urlparse(url)
+            host = parsed_url.hostname
+        except ValueError as e:
+            return f'Invalid url. {str(e)}'
+        if parsed_url.scheme not in ('http', 'https'):
+            return 'Invalid URL. Scheme must be http or https.'
+        if not host:
+            return 'Invalid URL. Hostname is missing.'
+
+        # A bare string would be joined character by character
+        # ('abc' -> 'a|b|c'), so treat it as a single entry.
+        if isinstance(remote_write_allowed_metrics, str):
+            remote_write_allowed_metrics = [remote_write_allowed_metrics]
+        allowed_metrics = '|'.join(
+            metric.strip()
+            for metric in (remote_write_allowed_metrics or [])
+            if metric and metric.strip()
+        )
+        # The template omits the whole remote_write section when the allow-list
+        # is empty, so accepting it here would report success for a no-op.
+        if not allowed_metrics:
+            return 'Invalid metrics. At least one allowed metric is required.'
+
+        prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
+        if not prometheus_spec:
+            return "Service prometheus not found\n"
+
+        # Only short-circuit when nothing changes; the same URL with a
+        # different allow-list must still be applied.
+        if (url == prometheus_spec.remote_write_url
+                and allowed_metrics == prometheus_spec.remote_write_allowed_metrics):
+            return f"Remote write URL '{url}' already exists.\n"
+
+        prometheus_spec.remote_write_url = url
+        prometheus_spec.remote_write_allowed_metrics = allowed_metrics
+
+        spec = ServiceSpec.from_json(prometheus_spec.to_json())
+        self.apply([spec], no_overwrite=False)
+
+        return 'prometheus remote write updated'
+
+    @handle_orch_error
+    def remove_prometheus_remote_write(self, url: str) -> str:
+        """Clear the Prometheus remote write settings if ``url`` is the configured one.
+
+        On a match the URL and the metric allow-list on the prometheus service
+        spec are reset to empty strings and the spec is re-applied through
+        ``self.apply``.
+
+        :param url: the remote write URL that is expected to be configured.
+        :return: a status message, or a message explaining why nothing was
+            removed.
+        """
+        if not url or not url.strip():
+            return 'Invalid URL. URL cannot be empty.'
+
+        prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec)
+        # Check the spec before reading its attributes; the check used to come
+        # after the attribute access and could never take effect.
+        if not prometheus_spec:
+            return "Service prometheus not found\n"
+
+        # Compare ignoring surrounding whitespace so a stray space on either
+        # side does not make the configured URL impossible to remove.
+        stored_url = prometheus_spec.remote_write_url or ''
+        if url.strip() != stored_url.strip():
+            return f"Remote write URL '{url}' does not exist.\n"
+
+        prometheus_spec.remote_write_url = ''
+        prometheus_spec.remote_write_allowed_metrics = ''
+
+        spec = ServiceSpec.from_json(prometheus_spec.to_json())
+        self.apply([spec], no_overwrite=False)
+        return 'prometheus remote write removed'
+
     @handle_orch_error
     def set_alertmanager_access_info(self, user: str, password: str) -> str:
         self.set_store(AlertmanagerService.USER_CFG_KEY, user)
index afbf9cd0ca0aef2c8ee5676df641591fd67e1802..c2132325ee6c25173b8d3a1b8b9f03f35a995f69 100644 (file)
@@ -582,6 +582,8 @@ class PrometheusService(CephadmService):
         retention_time = get_field_from_spec(spec, 'retention_time', '15d')
         retention_size = get_field_from_spec(spec, 'retention_size', '0')
         targets = get_field_from_spec(spec, 'targets', [])
+        remote_write_url = get_field_from_spec(spec, 'remote_write_url', '')
+        remote_write_allowed_metrics = get_field_from_spec(spec, 'remote_write_allowed_metrics', '')
 
         # build service discovery end-point
         security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config()
@@ -607,6 +609,8 @@ class PrometheusService(CephadmService):
             'service_discovery_password': self.mgr.http_server.service_discovery.password,
             'service_discovery_cfg': self.get_service_discovery_cfg(security_enabled, mgmt_gw_enabled),
             'external_prometheus_targets': targets,
+            'remote_write_url': remote_write_url,
+            'remote_write_allowed_metrics': remote_write_allowed_metrics,
             'cluster_fsid': self.mgr._cluster_fsid,
             'clusters_credentials': cluster_credentials,
             'federate_path': federate_path
@@ -683,6 +687,12 @@ class PrometheusService(CephadmService):
             # Ceph mgrs are dependency because when mgmt-gateway is not enabled the service-discovery depends on mgrs ips
             deps += mgr.cache.get_daemons_by_types(['mgr'])
 
+        if spec:
+            prometheus_spec = cast(PrometheusSpec, spec)
+
+            deps.append(f'remote_write_url:{prometheus_spec.remote_write_url}')
+            deps.append(f'remote_write_metrics:{prometheus_spec.remote_write_allowed_metrics}')
+
         return sorted(deps)
 
     def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
index 2afbf606af2b86ea2faadbe7a841c931f9b179e7..17c4ef85ae2bb4a75332d4a9a04d68c3b95161e3 100644 (file)
@@ -105,3 +105,17 @@ scrape_configs:
     static_configs:
     - targets: ['{{ url }}']
 {% endfor %}
+
+{% if remote_write_url and remote_write_allowed_metrics %}
+{#- NOTE(review): certificate verification of the remote endpoint is disabled unconditionally below; consider making it configurable. Both values are tested for truthiness so that None is treated like an empty string. #}
+remote_write:
+  - url: {{ remote_write_url }}/api/v1/write
+    tls_config:
+      insecure_skip_verify: true
+    write_relabel_configs:
+      - source_labels: [__name__]
+        regex: '^({{ remote_write_allowed_metrics }})$'
+        action: keep
+      - source_labels: [__name__]
+        regex: 'ALERTS|ALERTS_FOR_STATE'
+        action: drop
+{% endif %}
index c73f7fad620d3a0bb1e8f6a93eb5e81434438943..9beadc5d253a5f9d549cfae49ad5ea06f004f76b 100644 (file)
@@ -745,6 +745,7 @@ class TestMonitoring:
                     - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof
 
 
+
                 """).lstrip()
 
                 _run_cephadm.assert_called_with(
@@ -782,8 +783,8 @@ class TestMonitoring:
                             "use_url_prefix": False
                         },
                     }),
-                    error_ok=True,
                     use_current_daemon_image=False,
+                    error_ok=True,
                 )
 
     @patch("cephadm.module.CephadmOrchestrator.get_unique_name")
@@ -1005,6 +1006,7 @@ class TestMonitoring:
                         key_file: prometheus.key
 
 
+
                 """).lstrip()
 
                 _run_cephadm.assert_called_with(
@@ -1046,8 +1048,8 @@ class TestMonitoring:
                             'web_config': '/etc/prometheus/web.yml'
                         },
                     }),
-                    error_ok=True,
                     use_current_daemon_image=False,
+                    error_ok=True,
                 )
 
     @patch("cephadm.serve.CephadmServe._run_cephadm")
index f44de66878638b391b759c94f2e4a67db302e53c..1fbf8186bda4b20eff37c136ddc121194b8c0458 100644 (file)
@@ -4,7 +4,7 @@ import os
 import tempfile
 import time
 from datetime import datetime
-from typing import NamedTuple, Optional
+from typing import List, NamedTuple, Optional
 
 import requests
 
@@ -261,6 +261,17 @@ class Prometheus(PrometheusRESTController):
             return self.alert_proxy('GET', f'/alerts/groups?filter=cluster={fsid}', params)
         return self.alert_proxy('GET', '/alerts/groups', params)
 
+    @RESTController.Collection(method='PUT', path='/set_remote_write')
+    def set_remote_write(self, remote_write_url: str, remote_write_allowed_metrics: List[str]):
+        # Pass the remote write URL and metric allow-list to the orchestrator's
+        # monitoring manager and return whatever it reports.
+        monitoring = OrchClient.instance().monitoring
+        return monitoring.set_prometheus_remote_write(
+            remote_write_url, remote_write_allowed_metrics)
+
+    @RESTController.Collection(method='PUT', path='/remove_remote_write')
+    def remove_remote_write(self, url: str):
+        # Ask the orchestrator's monitoring manager to drop the remote write
+        # configuration for the given URL and return its result.
+        monitoring = OrchClient.instance().monitoring
+        return monitoring.remove_prometheus_remote_write(url)
+
     @RESTController.Collection(method='GET', path='/prometheus_query_data')
     def get_prometeus_query_data(self, **params):
         params['query'] = params.pop('params')
index a43c312b4e67b742a2c5d162d5a45bd78ba180b2..24d98a0f0f911729e9c3dec3803bcb05ef4f75ac 100644 (file)
@@ -17383,6 +17383,51 @@ paths:
       - jwt: []
       tags:
       - Prometheus
+  /api/prometheus/remove_remote_write:
+    put:
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              properties:
+                url:
+                  type: string
+              required:
+              - url
+              type: object
+      responses:
+        '200':
+          content:
+            application/json:
+              schema:
+                type: object
+            application/vnd.ceph.api.v1.0+json:
+              schema:
+                type: object
+          description: Resource updated.
+        '202':
+          content:
+            application/json:
+              schema:
+                type: object
+            application/vnd.ceph.api.v1.0+json:
+              schema:
+                type: object
+          description: Operation is still executing. Please check the task queue.
+        '400':
+          description: Operation exception. Please check the response body for details.
+        '401':
+          description: Unauthenticated access. Please login first.
+        '403':
+          description: Unauthorized access. Please check your permissions.
+        '500':
+          description: Unexpected error. Please check the response body for the stack
+            trace.
+      security:
+      - jwt: []
+      tags:
+      - Prometheus
   /api/prometheus/rules:
     get:
       parameters: []
@@ -17409,6 +17454,54 @@ paths:
       - jwt: []
       tags:
       - Prometheus
+  /api/prometheus/set_remote_write:
+    put:
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              properties:
+                remote_write_allowed_metrics:
+                  type: string
+                remote_write_url:
+                  type: string
+              required:
+              - remote_write_url
+              - remote_write_allowed_metrics
+              type: object
+      responses:
+        '200':
+          content:
+            application/json:
+              schema:
+                type: object
+            application/vnd.ceph.api.v1.0+json:
+              schema:
+                type: object
+          description: Resource updated.
+        '202':
+          content:
+            application/json:
+              schema:
+                type: object
+            application/vnd.ceph.api.v1.0+json:
+              schema:
+                type: object
+          description: Operation is still executing. Please check the task queue.
+        '400':
+          description: Operation exception. Please check the response body for details.
+        '401':
+          description: Unauthenticated access. Please login first.
+        '403':
+          description: Unauthorized access. Please check your permissions.
+        '500':
+          description: Unexpected error. Please check the response body for the stack
+            trace.
+      security:
+      - jwt: []
+      tags:
+      - Prometheus
   /api/prometheus/silence:
     post:
       parameters: []
index e2495a44444bee45e0f6595a937cbd40207baf63..f6940edc1d4763bfc87640d3a2514dc2a1a595bb 100644 (file)
@@ -245,6 +245,18 @@ class MonitoringManager(ResourceManager):
         """Get security config information"""
         return self.api.get_security_config()
 
+    @wait_api_result
+    def set_prometheus_remote_write(self, remote_write_url: str,
+                                    remote_write_allowed_metrics: List[str]) -> str:
+        """Forward the remote write URL and metric allow-list to the orchestrator API."""
+        api = self.api
+        return api.set_prometheus_remote_write(
+            remote_write_url, remote_write_allowed_metrics)
+
+    @wait_api_result
+    def remove_prometheus_remote_write(self, remote_write_url: str) -> str:
+        """Forward the removal of the given remote write URL to the orchestrator API."""
+        api = self.api
+        return api.remove_prometheus_remote_write(remote_write_url)
+
 
 class OrchClient(object):
 
index 136fde595ac05abde23e2a7785510544f07577e6..a25b09ea6f30b4cfa298cbb5bc017e30a1c01e11 100644 (file)
@@ -862,6 +862,14 @@ class Orchestrator(object):
         """remove prometheus target for multi-cluster"""
         raise NotImplementedError()
 
+    def set_prometheus_remote_write(self, url: str, remote_write_allowed_metrics: List[str]) -> OrchResult[str]:
+        """Set the prometheus remote write url and the metrics allowed to be written to it.
+
+        Interface placeholder: this base implementation always raises
+        ``NotImplementedError``.
+
+        :param url: remote write url.
+        :param remote_write_allowed_metrics: metrics allowed to be remote-written.
+        :raises NotImplementedError: always, in this base implementation.
+        """
+        raise NotImplementedError()
+
+    def remove_prometheus_remote_write(self, url: str) -> OrchResult[str]:
+        """Remove the prometheus remote write url and its allowed metrics.
+
+        Interface placeholder: this base implementation always raises
+        ``NotImplementedError``.
+
+        :param url: remote write url to remove.
+        :raises NotImplementedError: always, in this base implementation.
+        """
+        raise NotImplementedError()
+
     def get_alertmanager_access_info(self) -> OrchResult[Dict[str, str]]:
         """get alertmanager access information"""
         raise NotImplementedError()
index 83db20187f7b54ee6cf2e80e2b862f2c3a2f2a2e..f47775ae05d030536d185a4ab92c1ae601cabf42 100644 (file)
@@ -1432,6 +1432,18 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule):
         result = raise_if_exception(completion)
         return HandleCommandResult(stdout=json.dumps(result))
 
+    @OrchestratorCLICommand.Write('orch prometheus set-remote-write')
+    def _set_prometheus_remote_write(self, url: str, remote_write_allowed_metrics: List[str]) -> HandleCommandResult:
+        # Run the orchestrator call, let raise_if_exception surface a failed
+        # completion, and print the returned status message as JSON.
+        outcome = raise_if_exception(
+            self.set_prometheus_remote_write(url, remote_write_allowed_metrics))
+        return HandleCommandResult(stdout=json.dumps(outcome))
+
+    @OrchestratorCLICommand.Write('orch prometheus remove-remote-write')
+    def _remove_prometheus_remote_write(self, url: str) -> HandleCommandResult:
+        # Run the orchestrator call, let raise_if_exception surface a failed
+        # completion, and print the returned status message as JSON.
+        outcome = raise_if_exception(self.remove_prometheus_remote_write(url))
+        return HandleCommandResult(stdout=json.dumps(outcome))
+
     @OrchestratorCLICommand.Write('orch alertmanager set-credentials')
     def _set_alertmanager_access_info(self, username: Optional[str] = None, password: Optional[str] = None, inbuf: Optional[str] = None) -> HandleCommandResult:
         try:
index 246a9b898c6b8d87da3b1537ba2ac30e9608cda9..354d682a1e3054c31221d5e618271c54d724bef6 100644 (file)
@@ -973,6 +973,8 @@ class ServiceSpec(object):
                  preview_only: bool = False,
                  networks: Optional[List[str]] = None,
                  targets: Optional[List[str]] = None,
+                 remote_write_url: Optional[str] = None,
+                 remote_write_allowed_metrics: Optional[str] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
@@ -1021,6 +1023,8 @@ class ServiceSpec(object):
         #: :ref:`cephadm-rgw-networks` and :ref:`cephadm-mgr-networks`.
         self.networks: List[str] = networks or []
         self.targets: List[str] = targets or []
+        self.remote_write_url = remote_write_url
+        self.remote_write_allowed_metrics = remote_write_allowed_metrics
 
         self.config: Optional[Dict[str, str]] = None
         if config:
@@ -2894,6 +2898,8 @@ class MonitoringSpec(ServiceSpec):
                  preview_only: bool = False,
                  port: Optional[int] = None,
                  targets: Optional[List[str]] = None,
+                 remote_write_url: Optional[str] = None,
+                 remote_write_allowed_metrics: Optional[str] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
@@ -2908,7 +2914,9 @@ class MonitoringSpec(ServiceSpec):
             preview_only=preview_only, config=config,
             networks=networks, extra_container_args=extra_container_args,
             extra_entrypoint_args=extra_entrypoint_args,
-            custom_configs=custom_configs, targets=targets)
+            custom_configs=custom_configs, targets=targets,
+            remote_write_url=remote_write_url,
+            remote_write_allowed_metrics=remote_write_allowed_metrics)
 
         self.service_type = service_type
         self.port = port
@@ -3081,6 +3089,8 @@ class PrometheusSpec(MonitoringSpec):
                  retention_time: Optional[str] = None,
                  retention_size: Optional[str] = None,
                  targets: Optional[List[str]] = None,
+                 remote_write_url: Optional[str] = None,
+                 remote_write_allowed_metrics: Optional[str] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
@@ -3092,7 +3102,8 @@ class PrometheusSpec(MonitoringSpec):
             ssl=ssl, certificate_source=certificate_source,
             preview_only=preview_only, config=config, networks=networks, port=port, targets=targets,
             extra_container_args=extra_container_args, extra_entrypoint_args=extra_entrypoint_args,
-            custom_configs=custom_configs)
+            custom_configs=custom_configs, remote_write_url=remote_write_url,
+            remote_write_allowed_metrics=remote_write_allowed_metrics)
 
         self.retention_time = retention_time.strip() if retention_time else None
         self.retention_size = retention_size.strip() if retention_size else None