From e61c0e05676f8919703b33f7c63c904295a8185f Mon Sep 17 00:00:00 2001
From: Aashish Sharma
Date: Sat, 6 Apr 2024 08:59:13 +0530
Subject: [PATCH] mgr/dashboard: add check if federate job is running, else
 show reconnect message

Signed-off-by: Aashish Sharma
---
 src/pybind/mgr/cephadm/module.py              |  23 +++-
 src/pybind/mgr/cephadm/services/monitoring.py |  29 +++--
 .../services/prometheus/prometheus.yml.j2     |   5 +-
 src/pybind/mgr/cephadm/tests/test_cephadm.py  |  28 +++++
 src/pybind/mgr/cephadm/tests/test_services.py |  37 +++++--
 src/pybind/mgr/dashboard/controllers/auth.py  |   2 +-
 .../dashboard/controllers/multi_cluster.py    | 104 ++++++++++++------
 .../multi-cluster-list.component.ts           |   1 +
 .../multi-cluster.component.html              |  19 ++++
 .../multi-cluster/multi-cluster.component.ts  |  66 ++++++++++-
 .../app/shared/api/multi-cluster.service.ts   |   9 ++
 .../app/shared/enum/dashboard-promqls.enum.ts |   7 +-
 src/pybind/mgr/orchestrator/_interface.py     |   4 +-
 src/pybind/mgr/orchestrator/module.py         |   2 +-
 14 files changed, 264 insertions(+), 72 deletions(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 021ec23f135d4..03230f1a2df4f 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -12,6 +12,7 @@ from contextlib import contextmanager
 from functools import wraps
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from urllib.error import HTTPError
+from urllib.parse import urlparse
 from threading import Event
 
 from ceph.deployment.service_spec import PrometheusSpec
@@ -3184,12 +3185,12 @@ Then run the following:
         self.set_store(PrometheusService.USER_CFG_KEY, user)
         self.set_store(PrometheusService.PASS_CFG_KEY, password)
         return 'prometheus credentials updated correctly'
-    
+
     @handle_orch_error
     def set_prometheus_cert(self, cert: str) -> str:
         self.set_store(PrometheusService.PROMETHEUS_CERT_CFG_KEY, cert)
         return 'prometheus cert stored correctly'
-    
+
     @handle_orch_error
     def get_prometheus_cert(self) -> str:
         prometheus_cert = self.get_store(PrometheusService.PROMETHEUS_CERT_CFG_KEY)
@@ -3205,9 +3206,21 @@ Then run the following:
 
     @handle_orch_error
     def set_prometheus_target(self, url: str) -> str:
-        valid_url_pattern = r"^(?!http:\/\/)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})$"
-        if re.match(valid_url_pattern, url) is None:
-            return f"Invalid URL '{url}'. It should be in the format host_ip:port"
+        try:
+            if url.startswith("http://") or url.startswith("https://"):
+                return f"Invalid URL '{url}'. It should be in the format host_ip:port"
+
+            parsed_url_with_scheme = urlparse(f'http://{url}')
+            host = parsed_url_with_scheme.hostname
+            port = parsed_url_with_scheme.port
+
+            if not host or port is None:
+                raise ValueError("Hostname or port is missing.")
+
+            ipaddress.ip_address(host)
+
+        except (ValueError, OSError) as e:
+            return f"Invalid URL. {e}"
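For reference, the new validation can be exercised on its own. A minimal sketch (validate_target is an illustrative name, not part of the patch):

    import ipaddress
    from urllib.parse import urlparse

    def validate_target(url: str) -> str:
        # Reject explicit schemes up front, as set_prometheus_target() now does.
        if url.startswith(("http://", "https://")):
            return f"Invalid URL '{url}'. It should be in the format host_ip:port"
        try:
            # Prepend a scheme so urlparse() splits hostname and port reliably.
            parsed = urlparse(f'http://{url}')
            # Accessing parsed.port raises ValueError for out-of-range ports.
            if not parsed.hostname or parsed.port is None:
                raise ValueError("Hostname or port is missing.")
            ipaddress.ip_address(parsed.hostname)  # raises ValueError for non-IP hosts
        except ValueError as e:
            return f"Invalid URL. {e}"
        return 'ok'

    print(validate_target('127.0.0.1:67700'))  # Invalid URL. Port out of range 0-65535

The out-of-range-port message comes straight from urlparse's port property, which is what the new test case below relies on.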
{e}" prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec) if url not in prometheus_spec.targets: prometheus_spec.targets.append(url) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 7127406dd5964..61f03fab2fc97 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -1,6 +1,7 @@ import errno import logging import json +import logging import os import socket from typing import List, Any, Tuple, Dict, Optional, cast @@ -512,10 +513,13 @@ class PrometheusService(CephadmService): FSID = self.mgr._cluster_fsid clusters_credentials = {} - multi_cluster_config_raw = str(self.mgr.get_module_option_ex('dashboard', 'MULTICLUSTER_CONFIG')) - multi_cluster_config_str = multi_cluster_config_raw.replace("'", '"') - valid_multi_cluster_config_str = multi_cluster_config_str.replace('True', '"True"').replace('False', '"False"') - multi_cluster_config = json.loads(valid_multi_cluster_config_str) + multi_cluster_config_str = str(self.mgr.get_module_option_ex('dashboard', 'MULTICLUSTER_CONFIG')) + try: + multi_cluster_config = json.loads(multi_cluster_config_str) + except json.JSONDecodeError as e: + multi_cluster_config = None + logger.error(f'Invalid JSON format for multi-cluster config: {e}') + if multi_cluster_config: for url in targets: credentials = self.find_prometheus_credentials(multi_cluster_config, url) @@ -560,10 +564,13 @@ class PrometheusService(CephadmService): if security_enabled: r2: Dict[str, Any] = {'files': {}} + unique_id_counter = 1 for url, credentials in clusters_credentials.items(): - r2['files'][f'prometheus_{url}_cert.crt'] = credentials['certificate'] - credentials['cert_file_name'] = f'prometheus_{url}_cert.crt' - context['clusters_credentials'] = clusters_credentials + unique_id = unique_id_counter + unique_id_counter += 1 + r2['files'][f'prometheus_{unique_id}_cert.crt'] = credentials['certificate'] + credentials['cert_file_name'] = f'prometheus_{unique_id}_cert.crt' + context['clusters_credentials'] = clusters_credentials # Following key/cert are needed for: # 1- run the prometheus server (web.yml config) # 2- use mTLS to scrape node-exporter (prometheus acts as client) @@ -694,13 +701,13 @@ class PrometheusService(CephadmService): return HandleCommandResult(-errno.EBUSY, '', warn_message) return HandleCommandResult(0, warn_message, '') - def find_prometheus_credentials(self, multicluster_config, url): - for cluster_id, clusters in multicluster_config['config'].items(): + def find_prometheus_credentials(self, multicluster_config: Dict[str, Any], url: str) -> Optional[Dict[str, Any]]: + for _, clusters in multicluster_config['config'].items(): for cluster in clusters: prometheus_url = cluster.get('prometheus_url') if prometheus_url: - valid_url = prometheus_url.replace("https://", "").replace("http://", "") - if valid_url == url: + valid_url = prometheus_url.replace("https://", "").replace("http://", "") # since target URLs are without scheme + if valid_url == url: # check if the target URL matches with the prometheus URL (without scheme) in the config return cluster.get('prometheus_access_info') return None diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index 3170e4ea2e91d..f45328efb6d6d 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ 
diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
index 3170e4ea2e91d..f45328efb6d6d 100644
--- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
+++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
@@ -200,6 +200,10 @@ scrape_configs:
     scrape_interval: 15s
     honor_labels: true
     metrics_path: '/federate'
+    relabel_configs:
+    - source_labels: [__address__]
+      target_label: cluster
+      replacement: {{ cluster_fsid }}
 {% if secure_monitoring_stack %}
     scheme: https
     tls_config:
@@ -214,7 +218,6 @@ scrape_configs:
       - '{job="node"}'
       - '{job="haproxy"}'
       - '{job="ceph-exporter"}'
-      - '{job="nvmeof"}'
     static_configs:
     - targets: ['{{ url }}']
 {% endfor %}
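The relabel rule added above attaches a cluster label to every series scraped from the federate target (with honor_labels: true, series that already carry their own cluster label keep it; synthetic series such as up pick up the target label, which is what the dashboard check below depends on). Roughly, in Python terms — an illustration of the relabeling semantics, not code from this patch:

    def apply_relabel(target_labels: dict, cluster_fsid: str) -> dict:
        # source_labels: [__address__], target_label: cluster, static replacement:
        # regardless of what __address__ holds, 'cluster' is set to the given fsid.
        labels = dict(target_labels)
        labels['cluster'] = cluster_fsid
        return labels

    print(apply_relabel({'__address__': '192.168.100.100:9090'}, 'local-fsid'))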
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index b3dc921ae5660..cca754f0e648d 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -159,6 +159,34 @@ class TestCephadm(object):
         new_mgr = cephadm_module.get_unique_name('mgr', 'myhost', existing)
         match_glob(new_mgr, 'myhost.*')
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
+    def test_valid_url(self, cephadm_module):
+        # Test with valid URLs
+        test_cases = [
+            ("192.168.100.100:9090", "prometheus multi-cluster targets updated"),
+            ("127.0.0.1:8080", "prometheus multi-cluster targets updated"),
+        ]
+        with with_host(cephadm_module, 'test'):
+            with with_service(cephadm_module, ServiceSpec(service_type='prometheus'), CephadmOrchestrator.apply_prometheus, 'test'):
+                for url, expected_output in test_cases:
+                    c = cephadm_module.set_prometheus_target(url)
+                    assert wait(cephadm_module,
+                                c) == expected_output
+
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
+    def test_invalid_url(self, cephadm_module):
+        # Test with invalid URLs
+        test_cases = [
+            ("http://example.com:9090", "Invalid URL 'http://example.com:9090'. It should be in the format host_ip:port"),
+            ("127.0.0.1:67700", "Invalid URL. Port out of range 0-65535")
+        ]
+        with with_host(cephadm_module, 'test'):
+            with with_service(cephadm_module, ServiceSpec(service_type='prometheus'), CephadmOrchestrator.apply_prometheus, 'test'):
+                for url, expected_output in test_cases:
+                    c = cephadm_module.set_prometheus_target(url)
+                    assert wait(cephadm_module,
+                                c) == expected_output
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
     def test_host(self, cephadm_module):
         assert wait(cephadm_module, cephadm_module.get_hosts()) == []
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 08802cd5923dc..9bbb0ab588637 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -785,7 +785,6 @@ class TestMonitoring:
 
             scrape_configs:
             - job_name: 'ceph'
-              honor_labels: true
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
@@ -793,31 +792,32 @@ class TestMonitoring:
               - source_labels: [instance]
                 target_label: instance
                 replacement: 'ceph_cluster'
+              honor_labels: true
               http_sd_configs:
               - url: http://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus
 
             - job_name: 'node'
-              http_sd_configs:
-              - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
                 replacement: fsid
+              http_sd_configs:
+              - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
 
             - job_name: 'haproxy'
-              http_sd_configs:
-              - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
                 replacement: fsid
+              http_sd_configs:
+              - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy
 
             - job_name: 'ceph-exporter'
-              honor_labels: true
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
                 replacement: fsid
+              honor_labels: true
               http_sd_configs:
               - url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter
@@ -937,6 +937,8 @@ class TestMonitoring:
             global:
               scrape_interval: 10s
              evaluation_interval: 10s
+              external_labels:
+                cluster: fsid
 
             rule_files:
             - /etc/prometheus/alerting/*
@@ -962,14 +964,17 @@ class TestMonitoring:
 
             scrape_configs:
             - job_name: 'ceph'
-              scheme: https
-              tls_config:
-                ca_file: root_cert.pem
-              honor_labels: true
               relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               - source_labels: [instance]
                 target_label: instance
                 replacement: 'ceph_cluster'
+              scheme: https
+              tls_config:
+                ca_file: root_cert.pem
+              honor_labels: true
               http_sd_configs:
               - url: https://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus
                 basic_auth:
                   username: sd_user
                   password: sd_password
                 tls_config:
                   ca_file: root_cert.pem
@@ -979,6 +984,10 @@ class TestMonitoring:
                   ca_file: root_cert.pem
 
             - job_name: 'node'
+              relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               scheme: https
               tls_config:
                 ca_file: root_cert.pem
@@ -993,6 +1002,10 @@ class TestMonitoring:
                   ca_file: root_cert.pem
 
             - job_name: 'haproxy'
+              relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               scheme: https
               tls_config:
                 ca_file: root_cert.pem
@@ -1005,6 +1018,10 @@ class TestMonitoring:
                   ca_file: root_cert.pem
 
             - job_name: 'ceph-exporter'
+              relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               honor_labels: true
               scheme: https
               tls_config:
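With the cluster id now present both as a global external label and as a per-job relabel, an expected block like the one above can be sanity-checked quickly (assumes PyYAML is available; the snippet is illustrative, not part of the test suite):

    import yaml  # assumption: PyYAML is installed

    expected = """
    global:
      scrape_interval: 10s
      evaluation_interval: 10s
      external_labels:
        cluster: fsid
    """
    cfg = yaml.safe_load(expected)
    assert cfg['global']['external_labels']['cluster'] == 'fsid'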
diff --git a/src/pybind/mgr/dashboard/controllers/auth.py b/src/pybind/mgr/dashboard/controllers/auth.py
index 2e6cf855c2977..baadad369593a 100644
--- a/src/pybind/mgr/dashboard/controllers/auth.py
+++ b/src/pybind/mgr/dashboard/controllers/auth.py
@@ -126,7 +126,7 @@ class Auth(RESTController, ControllerAuthMixin):
                 ]
             }
         }
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return {
             'token': token,
             'username': username,
diff --git a/src/pybind/mgr/dashboard/controllers/multi_cluster.py b/src/pybind/mgr/dashboard/controllers/multi_cluster.py
index 1551f0969ff74..4d97ddd29e149 100644
--- a/src/pybind/mgr/dashboard/controllers/multi_cluster.py
+++ b/src/pybind/mgr/dashboard/controllers/multi_cluster.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
 
 import base64
+import ipaddress
 import json
-import re
+import logging
 import tempfile
 import time
 from typing import Any, Dict
 from urllib.parse import urlparse
@@ -19,6 +20,8 @@ from ..tools import configure_cors
 from . import APIDoc, APIRouter, CreatePermission, DeletePermission, Endpoint, \
     EndpointDoc, ReadPermission, RESTController, UIRouter, UpdatePermission
 
+logger = logging.getLogger('controllers.multi_cluster')
+
 
 @APIRouter('/multi-cluster', Scope.CONFIG_OPT)
 @APIDoc('Multi-cluster Management API', 'Multi-cluster')
@@ -78,7 +81,8 @@ class MultiCluster(RESTController):
             'ttl': ttl
         }
         cluster_token = self.check_cluster_connection(url, payload, username,
                                                       ssl_verify, ssl_certificate,
                                                       'connect')
 
         cors_endpoints_string = self.get_cors_endpoints_string(hub_url)
 
@@ -141,7 +145,8 @@ class MultiCluster(RESTController):
         cors_endpoints_string = ", ".join(cors_endpoints_set)
         return cors_endpoints_string
 
-    def check_cluster_connection(self, url, payload, username, ssl_verify, ssl_certificate):
+    def check_cluster_connection(self, url, payload, username, ssl_verify, ssl_certificate,
+                                 action):
         try:
             hub_cluster_version = mgr.version.split('ceph version ')[1]
             multi_cluster_content = self._proxy('GET', url, 'api/multi-cluster/get_config',
@@ -185,7 +190,7 @@ class MultiCluster(RESTController):
 
         managed_by_clusters_config = managed_by_clusters_content['value']
 
-        if len(managed_by_clusters_config) > 1:
+        if len(managed_by_clusters_config) > 1 and action == 'connect':
             raise DashboardException(msg='Cluster is already managed by another cluster',
                                      code='cluster_managed_by_another_cluster',
                                      component='multi-cluster')
@@ -222,7 +227,7 @@ class MultiCluster(RESTController):
                 "ssl_certificate": ssl_certificate if ssl_certificate else '',
                 "prometheus_access_info": prometheus_access_info
             }]
-        Settings.MULTICLUSTER_CONFIG = multi_cluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multi_cluster_config)
 
     def load_multi_cluster_config(self):
         if isinstance(Settings.MULTICLUSTER_CONFIG, str):
@@ -242,7 +247,7 @@ class MultiCluster(RESTController):
         multicluster_config = self.load_multi_cluster_config()
         multicluster_config.update({'current_url': config['url']})
         multicluster_config.update({'current_user': config['user']})
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return Settings.MULTICLUSTER_CONFIG
 
     @Endpoint('PUT')
@@ -259,9 +264,17 @@ class MultiCluster(RESTController):
         }
 
         cluster_token = self.check_cluster_connection(url, payload, username,
-                                                      ssl_verify, ssl_certificate)
+                                                      ssl_verify, ssl_certificate,
+                                                      'reconnect')
+
+        prometheus_url = self._proxy('GET', url, 'api/multi-cluster/get_prometheus_api_url',
+                                     token=cluster_token)
+
+        prometheus_access_info = self._proxy('GET', url,
+                                             'ui-api/multi-cluster/get_prometheus_access_info',  # noqa E501 #pylint: disable=line-too-long
+                                             token=cluster_token)
 
-        if username and cluster_token:
+        if username and cluster_token and prometheus_url and prometheus_access_info:
             if "config" in multicluster_config:
                 for _, cluster_details in multicluster_config["config"].items():
                     for cluster in cluster_details:
@@ -269,7 +282,11 @@
                             cluster['token'] = cluster_token
                             cluster['ssl_verify'] = ssl_verify
                             cluster['ssl_certificate'] = ssl_certificate
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+                            cluster['prometheus_access_info'] = prometheus_access_info
+                            _remove_prometheus_targets(cluster['prometheus_url'])
+                            time.sleep(5)
+                            _set_prometheus_targets(prometheus_url)
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return True
 
     @Endpoint('PUT')
@@ -285,7 +302,7 @@ class MultiCluster(RESTController):
                         cluster['cluster_alias'] = cluster_alias
                         cluster['ssl_verify'] = verify
                         cluster['ssl_certificate'] = ssl_certificate if verify else ''
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return Settings.MULTICLUSTER_CONFIG
 
     @Endpoint(method='DELETE')
@@ -303,16 +320,9 @@ class MultiCluster(RESTController):
                 cluster_token = value[0]['token']
                 cluster_ssl_certificate = value[0]['ssl_certificate']
                 cluster_ssl_verify = value[0]['ssl_verify']
-                orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
-                try:
-                    if orch_backend == 'cephadm':
-                        cmd = {
-                            'prefix': 'orch prometheus remove-target',
-                            'url': value[0]['prometheus_url'].replace('http://', '').replace('https://', '')  # noqa E501 #pylint: disable=line-too-long
-                        }
-                        mgr.mon_command(cmd)
-                except KeyError:
-                    pass
+                cluster_prometheus_url = value[0]['prometheus_url']
+
+                _remove_prometheus_targets(cluster_prometheus_url)
 
                 managed_by_clusters_content = self._proxy('GET', cluster_url,
                                                           'api/settings/MANAGED_BY_CLUSTERS',
@@ -332,13 +342,14 @@ class MultiCluster(RESTController):
                 del multicluster_config['config'][key]
                 break
 
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return Settings.MULTICLUSTER_CONFIG
 
     @Endpoint()
     @ReadPermission
     def get_config(self):
-        return Settings.MULTICLUSTER_CONFIG
+        multi_cluster_config = self.load_multi_cluster_config()
+        return multi_cluster_config
 
     def is_token_expired(self, jwt_token):
         split_message = jwt_token.split(".")
@@ -388,8 +399,12 @@ class MultiCluster(RESTController):
         prometheus_url = Settings.PROMETHEUS_API_HOST
         if prometheus_url is not None:
             # check if is url is already in IP format
-            pattern = r'^(?:https?|http):\/\/(?:\d{1,3}\.){3}\d{1,3}:\d+$'
-            valid_ip_url = bool(re.match(pattern, prometheus_url))
+            try:
+                url_parts = urlparse(prometheus_url)
+                ipaddress.ip_address(url_parts.hostname)
+                valid_ip_url = True
+            except ValueError:
+                valid_ip_url = False
             if not valid_ip_url:
                 parsed_url = urlparse(prometheus_url)
                 hostname = parsed_url.hostname
@@ -408,7 +423,7 @@ class MultiClusterUi(RESTController):
     @UpdatePermission
     def set_cors_endpoint(self, url: str):
        configure_cors(url)
-    
+
     @Endpoint('GET')
     @ReadPermission
     def get_prometheus_access_info(self):
@@ -412,14 +427,14 @@ class MultiClusterUi(RESTController):
         orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
         if orch_backend == 'cephadm':
             cmd = {
                 'prefix': 'orch prometheus get-credentials',
             }
-            ret, out, _ = mgr.mon_command(cmd)
-            if ret == 0 and out is not None:
-                access_info = json.loads(out)
-                user = access_info['user']
-                password = access_info['password']
+            ret_status, out, _ = mgr.mon_command(cmd)
+            if ret_status == 0 and out is not None:
+                prom_access_info = json.loads(out)
+                user = prom_access_info['user']
+                password = prom_access_info['password']
 
             cert_cmd = {
                 'prefix': 'orch prometheus get-prometheus-cert',
             }
@@ -439,13 +454,30 @@ class MultiClusterUi(RESTController):
                 'password': password,
                 'certificate': prometheus_cert
             }
+        return None
 
 
 def _set_prometheus_targets(prometheus_url: str):
     orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
-    if orch_backend == 'cephadm':
-        cmd = {
-            'prefix': 'orch prometheus set-target',
-            'url': prometheus_url.replace('http://', '').replace('https://', '')
-        }
-        mgr.mon_command(cmd)
+    try:
+        if orch_backend == 'cephadm':
+            cmd = {
+                'prefix': 'orch prometheus set-target',
+                'url': prometheus_url.replace('http://', '').replace('https://', '')
+            }
+            mgr.mon_command(cmd)
+    except KeyError:
+        logger.exception('Failed to set prometheus targets')
+
+
+def _remove_prometheus_targets(prometheus_url: str):
+    orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
+    try:
+        if orch_backend == 'cephadm':
+            cmd = {
+                'prefix': 'orch prometheus remove-target',
+                'url': prometheus_url.replace('http://', '').replace('https://', '')
+            }
+            mgr.mon_command(cmd)
+    except KeyError:
+        logger.exception('Failed to remove prometheus targets')
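Taken together, the reconnect path swaps the federation target in two steps. A condensed sketch built on the helpers defined above (swap_prometheus_target is an illustrative wrapper; the 5-second pause is the same grace period the reconnect endpoint uses before registering the new target):

    import time

    def swap_prometheus_target(old_url: str, new_url: str) -> None:
        _remove_prometheus_targets(old_url)  # 'orch prometheus remove-target'
        time.sleep(5)                        # let cephadm settle before re-adding
        _set_prometheus_targets(new_url)     # 'orch prometheus set-target'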
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
index cfb3435425463..9f05ab668ab5f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
@@ -50,6 +50,7 @@ export class MultiClusterListComponent extends ListWithDetails implements OnInit
   currentUrl: string;
   icons = Icons;
   managedByConfig$: Observable<any>;
+  prometheusConnectionError: any[] = [];
 
   constructor(
     private multiClusterService: MultiClusterService,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html
index 6d0722dcfea70..7a3657a2aa906 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html
@@ -66,6 +66,25 @@
+      <cd-alert-panel *ngIf="prometheusConnectionError.length > 0"
+                      type="danger"
+                      i18n>
+        Couldn't fetch metrics from the following clusters. Please reconnect the respective clusters to re-establish the prometheus connection -
+        <span *ngFor="let cluster of prometheusConnectionError">
+          <a href="javascript:void(0)"
+             (click)="openReconnectClusterForm(cluster)">
+            {{ cluster['cluster_alias']}} - {{ cluster['cluster_name'] }}
+          </a>
+        </span>
+      </cd-alert-panel>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts
@@ -55,6 +56,8 @@ export class MultiClusterComponent implements OnInit, OnDestroy {
; + clusterDetailsArray: any[]; + prometheusConnectionError: any[] = []; constructor( private multiClusterService: MultiClusterService, @@ -164,6 +167,7 @@ export class MultiClusterComponent implements OnInit, OnDestroy { this.subs.add( this.multiClusterService.subscribe((resp: any) => { this.isMultiCluster = Object.keys(resp['config']).length > 1; + this.clusterDetailsArray = Object.values(resp['config']).flat(); const hubUrl = resp['hub_url']; for (const key in resp['config']) { if (resp['config'].hasOwnProperty(key)) { @@ -243,7 +247,8 @@ export class MultiClusterComponent implements OnInit, OnDestroy { 'POOL_IOPS_UTILIZATION', 'POOL_THROUGHPUT_UTILIZATION', 'HOSTS', - 'CLUSTER_ALERTS' + 'CLUSTER_ALERTS', + 'FEDERATE_UP_METRIC' ]; let validSelectedQueries = allMultiClusterQueries; @@ -334,6 +339,10 @@ export class MultiClusterComponent implements OnInit, OnDestroy { const osds = this.findClusterData(this.queriesResults?.OSDS, clusterName); const status = this.findClusterData(this.queriesResults?.HEALTH_STATUS, clusterName); const available_capacity = totalCapacity - usedCapacity; + const federateMetrics = this.queriesResults?.FEDERATE_UP_METRIC.filter( + (metric: any) => metric.metric.job === 'federate' + ); + this.checkFederateMetricsStatus(federateMetrics); clusters.push({ cluster: clusterName.trim(), @@ -390,6 +399,59 @@ export class MultiClusterComponent implements OnInit, OnDestroy { ); } + checkFederateMetricsStatus(federateMetrics: any) { + this.prometheusConnectionError = []; + federateMetrics.forEach((entry1: { metric: { instance: any }; value: any }) => { + const instanceIpPort = entry1.metric.instance; + const instanceIp = instanceIpPort.split(':')[0]; + const instancePort = instanceIpPort.split(':')[1]; + const prometheus_federation_status = entry1.value[1]; + + this.clusterDetailsArray.forEach((entry2) => { + if (entry2['name'] !== this.localClusterName) { + const prometheusUrl = entry2['prometheus_url'] + .replace('http://', '') + .replace('https://', ''); + const prometheusIp = prometheusUrl.split(':')[0]; + const prometheusPort = prometheusUrl.split(':')[1]; + + if ( + instanceIp === prometheusIp && + instancePort === prometheusPort && + prometheus_federation_status === '0' + ) { + this.prometheusConnectionError.push({ + cluster_name: entry2.name, + cluster_alias: entry2.cluster_alias, + url: entry2.url, + user: entry2.user, + ssl_verify: entry2.ssl_verify, + ssl_certificate: entry2.ssl_certificate + }); + } + } + }); + }); + } + + openReconnectClusterForm(cluster: any) { + const initialState = { + action: 'reconnect', + cluster: cluster + }; + this.bsModalRef = this.modalService.show(MultiClusterFormComponent, initialState, { + size: 'lg' + }); + this.bsModalRef.componentInstance.submitAction.subscribe(() => { + this.loading = true; + setTimeout(() => { + const currentRoute = this.router.url.split('?')[0]; + this.multiClusterService.refreshMultiCluster(currentRoute); + this.getPrometheusData(this.prometheusService.lastHourDateObject); + }, this.PROMETHEUS_DELAY); + }); + } + findClusterData(metrics: any, clusterName: string) { const clusterMetrics = this.findCluster(metrics, clusterName); return parseInt(clusterMetrics?.value[1] || 0); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts index 9c2dcda4d8dc1..7cd1a76a8c1e9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts +++ 
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts
index 9c2dcda4d8dc1..7cd1a76a8c1e9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts
@@ -17,6 +17,8 @@ export class MultiClusterService {
   tokenStatusSource$ = this.tokenStatusSource.asObservable();
   showDeletionMessage = false;
   isClusterAddedFlag = false;
+  prometheusConnectionError: any[] = [];
+
   constructor(
     private http: HttpClient,
     private timerService: TimerService,
@@ -219,6 +221,13 @@ export class MultiClusterService {
     return this.isClusterAddedFlag;
   }
 
+  managePrometheusConnectionError(prometheusConnectionError?: any[]) {
+    if (prometheusConnectionError !== undefined) {
+      this.prometheusConnectionError = prometheusConnectionError;
+    }
+    return this.prometheusConnectionError;
+  }
+
   refreshMultiCluster(currentRoute: string) {
     this.refresh();
     this.refreshTokenStatus();
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts
index 1a7c7af9d3a46..361a404a11b20 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts
@@ -34,9 +34,10 @@ export enum MultiClusterPromqls {
   CRITICAL_ALERTS_COUNT = 'count(ALERTS{alertstate="firing",severity="critical"}) or vector(0)',
   WARNING_ALERTS_COUNT = 'count(ALERTS{alertstate="firing",severity="warning"}) or vector(0)',
   ALERTS = 'ALERTS{alertstate="firing"}',
-  HOSTS = 'count_values("hostname", ceph_mon_metadata) by (cluster) or vector (0)',
-  TOTAL_HOSTS = 'count(sum by (hostname) (ceph_osd_metadata)) or vector(0)',
-  CLUSTER_ALERTS = 'count by (cluster) (ALERTS{alertstate="firing"}) or vector(0)'
+  HOSTS = 'sum by (hostname, cluster) (group by (hostname, cluster) (ceph_osd_metadata)) or vector(0)',
+  TOTAL_HOSTS = 'count by (cluster) (ceph_osd_metadata) or vector(0)',
+  CLUSTER_ALERTS = 'count by (cluster) (ALERTS{alertstate="firing"}) or vector(0)',
+  FEDERATE_UP_METRIC = 'up'
 }
 
 export enum MultiClusterPromqlsForClusterUtilization {
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index 7839e1e8386e7..c91c4d87c1586 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -791,8 +791,8 @@ class Orchestrator(object):
         """set alertmanager access information"""
         raise NotImplementedError()
 
-    def get_prometheus_cert(self, url: str) -> OrchResult[str]:
-        """set prometheus target for multi-cluster"""
+    def get_prometheus_cert(self) -> OrchResult[str]:
+        """get prometheus cert for multi-cluster"""
         raise NotImplementedError()
 
     def set_prometheus_access_info(self, user: str, password: str) -> OrchResult[str]:
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index d3ef1e59817ab..95b2843d3e339 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -1232,7 +1232,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
         completion = self.set_prometheus_target(url)
         result = raise_if_exception(completion)
         return HandleCommandResult(stdout=json.dumps(result))
-    
+
     @_cli_write_command('orch prometheus get-prometheus-cert')
     def _get_prometheus_cert(self) -> HandleCommandResult:
         completion = self.get_prometheus_cert()
-- 
2.39.5
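For completeness, the CLI plumbing above can be consumed from another mgr module along these lines (a sketch: fetch_prometheus_cert is a hypothetical helper, mgr stands for the active MgrModule instance, and the output is assumed to be the PEM text returned by get_prometheus_cert):

    def fetch_prometheus_cert(mgr):
        # mon_command returns (retval, stdout, stderr) on MgrModule instances.
        ret, out, err = mgr.mon_command({'prefix': 'orch prometheus get-prometheus-cert'})
        if ret != 0:
            raise RuntimeError(f'get-prometheus-cert failed: {err}')
        return out  # PEM-encoded certificate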