From e61c0e05676f8919703b33f7c63c904295a8185f Mon Sep 17 00:00:00 2001
From: Aashish Sharma
Date: Sat, 6 Apr 2024 08:59:13 +0530
Subject: [PATCH] mgr/dashboard: add check if federate job is running, else
 show reconnect message

Signed-off-by: Aashish Sharma
---
 src/pybind/mgr/cephadm/module.py              |  23 +++-
 src/pybind/mgr/cephadm/services/monitoring.py |  29 +++--
 .../services/prometheus/prometheus.yml.j2     |   5 +-
 src/pybind/mgr/cephadm/tests/test_cephadm.py  |  28 +++++
 src/pybind/mgr/cephadm/tests/test_services.py |  37 +++++--
 src/pybind/mgr/dashboard/controllers/auth.py  |   2 +-
 .../dashboard/controllers/multi_cluster.py    | 104 ++++++++++++------
 .../multi-cluster-list.component.ts           |   1 +
 .../multi-cluster.component.html              |  19 ++++
 .../multi-cluster/multi-cluster.component.ts  |  66 ++++++++++-
 .../app/shared/api/multi-cluster.service.ts   |   9 ++
 .../app/shared/enum/dashboard-promqls.enum.ts |   7 +-
 src/pybind/mgr/orchestrator/_interface.py     |   4 +-
 src/pybind/mgr/orchestrator/module.py         |   2 +-
 14 files changed, 264 insertions(+), 72 deletions(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 021ec23f135d4..03230f1a2df4f 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -12,6 +12,7 @@ from contextlib import contextmanager
 from functools import wraps
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from urllib.error import HTTPError
+from urllib.parse import urlparse
 from threading import Event
 
 from ceph.deployment.service_spec import PrometheusSpec
@@ -3184,12 +3185,12 @@ Then run the following:
         self.set_store(PrometheusService.USER_CFG_KEY, user)
         self.set_store(PrometheusService.PASS_CFG_KEY, password)
         return 'prometheus credentials updated correctly'
-    
+
     @handle_orch_error
     def set_prometheus_cert(self, cert: str) -> str:
         self.set_store(PrometheusService.PROMETHEUS_CERT_CFG_KEY, cert)
         return 'prometheus cert stored correctly'
-    
+
     @handle_orch_error
     def get_prometheus_cert(self) -> str:
         prometheus_cert = self.get_store(PrometheusService.PROMETHEUS_CERT_CFG_KEY)
@@ -3205,9 +3206,21 @@ Then run the following:
 
     @handle_orch_error
     def set_prometheus_target(self, url: str) -> str:
-        valid_url_pattern = r"^(?!http:\/\/)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})$"
-        if re.match(valid_url_pattern, url) is None:
-            return f"Invalid URL '{url}'. It should be in the format host_ip:port"
+        try:
+            if url.startswith("http://") or url.startswith("https://"):
+                return f"Invalid URL '{url}'. It should be in the format host_ip:port"
+
+            parsed_url_with_scheme = urlparse(f'http://{url}')
+            host = parsed_url_with_scheme.hostname
+            port = parsed_url_with_scheme.port
+
+            if not host or port is None:
+                raise ValueError("Hostname or port is missing.")
+
+            ipaddress.ip_address(host)
+
+        except (ValueError, OSError) as e:
+            return f"Invalid URL. {e}"
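For reference, the new validation can be exercised on its own. A minimal sketch (validate_target is an illustrative name, not part of the patch):

    import ipaddress
    from urllib.parse import urlparse

    def validate_target(url: str) -> str:
        # Reject explicit schemes up front, as set_prometheus_target() now does.
        if url.startswith(("http://", "https://")):
            return f"Invalid URL '{url}'. It should be in the format host_ip:port"
        try:
            # Prepend a scheme so urlparse() splits hostname and port reliably.
            parsed = urlparse(f'http://{url}')
            # Accessing parsed.port raises ValueError for out-of-range ports.
            if not parsed.hostname or parsed.port is None:
                raise ValueError("Hostname or port is missing.")
            ipaddress.ip_address(parsed.hostname)  # raises ValueError for non-IP hosts
        except ValueError as e:
            return f"Invalid URL. {e}"
        return 'ok'

    print(validate_target('127.0.0.1:67700'))  # Invalid URL. Port out of range 0-65535

The out-of-range-port message comes straight from urlparse's port property, which is what the new test case below relies on.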
{e}" prometheus_spec = cast(PrometheusSpec, self.spec_store['prometheus'].spec) if url not in prometheus_spec.targets: prometheus_spec.targets.append(url) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 7127406dd5964..61f03fab2fc97 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -1,6 +1,7 @@ import errno import logging import json +import logging import os import socket from typing import List, Any, Tuple, Dict, Optional, cast @@ -512,10 +513,13 @@ class PrometheusService(CephadmService): FSID = self.mgr._cluster_fsid clusters_credentials = {} - multi_cluster_config_raw = str(self.mgr.get_module_option_ex('dashboard', 'MULTICLUSTER_CONFIG')) - multi_cluster_config_str = multi_cluster_config_raw.replace("'", '"') - valid_multi_cluster_config_str = multi_cluster_config_str.replace('True', '"True"').replace('False', '"False"') - multi_cluster_config = json.loads(valid_multi_cluster_config_str) + multi_cluster_config_str = str(self.mgr.get_module_option_ex('dashboard', 'MULTICLUSTER_CONFIG')) + try: + multi_cluster_config = json.loads(multi_cluster_config_str) + except json.JSONDecodeError as e: + multi_cluster_config = None + logger.error(f'Invalid JSON format for multi-cluster config: {e}') + if multi_cluster_config: for url in targets: credentials = self.find_prometheus_credentials(multi_cluster_config, url) @@ -560,10 +564,13 @@ class PrometheusService(CephadmService): if security_enabled: r2: Dict[str, Any] = {'files': {}} + unique_id_counter = 1 for url, credentials in clusters_credentials.items(): - r2['files'][f'prometheus_{url}_cert.crt'] = credentials['certificate'] - credentials['cert_file_name'] = f'prometheus_{url}_cert.crt' - context['clusters_credentials'] = clusters_credentials + unique_id = unique_id_counter + unique_id_counter += 1 + r2['files'][f'prometheus_{unique_id}_cert.crt'] = credentials['certificate'] + credentials['cert_file_name'] = f'prometheus_{unique_id}_cert.crt' + context['clusters_credentials'] = clusters_credentials # Following key/cert are needed for: # 1- run the prometheus server (web.yml config) # 2- use mTLS to scrape node-exporter (prometheus acts as client) @@ -694,13 +701,13 @@ class PrometheusService(CephadmService): return HandleCommandResult(-errno.EBUSY, '', warn_message) return HandleCommandResult(0, warn_message, '') - def find_prometheus_credentials(self, multicluster_config, url): - for cluster_id, clusters in multicluster_config['config'].items(): + def find_prometheus_credentials(self, multicluster_config: Dict[str, Any], url: str) -> Optional[Dict[str, Any]]: + for _, clusters in multicluster_config['config'].items(): for cluster in clusters: prometheus_url = cluster.get('prometheus_url') if prometheus_url: - valid_url = prometheus_url.replace("https://", "").replace("http://", "") - if valid_url == url: + valid_url = prometheus_url.replace("https://", "").replace("http://", "") # since target URLs are without scheme + if valid_url == url: # check if the target URL matches with the prometheus URL (without scheme) in the config return cluster.get('prometheus_access_info') return None diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index 3170e4ea2e91d..f45328efb6d6d 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ 
diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
index 3170e4ea2e91d..f45328efb6d6d 100644
--- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
+++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
@@ -200,6 +200,10 @@ scrape_configs:
     scrape_interval: 15s
     honor_labels: true
     metrics_path: '/federate'
+    relabel_configs:
+    - source_labels: [__address__]
+      target_label: cluster
+      replacement: {{ cluster_fsid }}
 {% if secure_monitoring_stack %}
     scheme: https
     tls_config:
@@ -214,7 +218,6 @@ scrape_configs:
       - '{job="node"}'
       - '{job="haproxy"}'
       - '{job="ceph-exporter"}'
-      - '{job="nvmeof"}'
     static_configs:
     - targets: ['{{ url }}']
 {% endfor %}
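The relabel rule added above attaches a cluster label to every series scraped from the federate target (with honor_labels: true, series that already carry their own cluster label keep it; synthetic series such as up pick up the target label, which is what the dashboard check below depends on). Roughly, in Python terms — an illustration of the relabeling semantics, not code from this patch:

    def apply_relabel(target_labels: dict, cluster_fsid: str) -> dict:
        # source_labels: [__address__], target_label: cluster, static replacement:
        # regardless of what __address__ holds, 'cluster' is set to the given fsid.
        labels = dict(target_labels)
        labels['cluster'] = cluster_fsid
        return labels

    print(apply_relabel({'__address__': '192.168.100.100:9090'}, 'local-fsid'))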
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index b3dc921ae5660..cca754f0e648d 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -159,6 +159,34 @@ class TestCephadm(object):
         new_mgr = cephadm_module.get_unique_name('mgr', 'myhost', existing)
         match_glob(new_mgr, 'myhost.*')
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
+    def test_valid_url(self, cephadm_module):
+        # Test with valid URLs
+        test_cases = [
+            ("192.168.100.100:9090", "prometheus multi-cluster targets updated"),
+            ("127.0.0.1:8080", "prometheus multi-cluster targets updated"),
+        ]
+        with with_host(cephadm_module, 'test'):
+            with with_service(cephadm_module, ServiceSpec(service_type='prometheus'), CephadmOrchestrator.apply_prometheus, 'test'):
+                for url, expected_output in test_cases:
+                    c = cephadm_module.set_prometheus_target(url)
+                    assert wait(cephadm_module,
+                                c) == expected_output
+
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
+    def test_invalid_url(self, cephadm_module):
+        # Test with invalid URLs
+        test_cases = [
+            ("http://example.com:9090", "Invalid URL 'http://example.com:9090'. It should be in the format host_ip:port"),
+            ("127.0.0.1:67700", "Invalid URL. Port out of range 0-65535")
+        ]
+        with with_host(cephadm_module, 'test'):
+            with with_service(cephadm_module, ServiceSpec(service_type='prometheus'), CephadmOrchestrator.apply_prometheus, 'test'):
+                for url, expected_output in test_cases:
+                    c = cephadm_module.set_prometheus_target(url)
+                    assert wait(cephadm_module,
+                                c) == expected_output
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
     def test_host(self, cephadm_module):
         assert wait(cephadm_module, cephadm_module.get_hosts()) == []
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 08802cd5923dc..9bbb0ab588637 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -785,7 +785,6 @@ class TestMonitoring:
 
             scrape_configs:
             - job_name: 'ceph'
-              honor_labels: true
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
@@ -793,31 +792,32 @@ class TestMonitoring:
               - source_labels: [instance]
                 target_label: instance
                 replacement: 'ceph_cluster'
+              honor_labels: true
               http_sd_configs:
               - url: http://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus
 
             - job_name: 'node'
-              http_sd_configs:
-              - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
                 replacement: fsid
+              http_sd_configs:
+              - url: http://[::1]:8765/sd/prometheus/sd-config?service=node-exporter
 
             - job_name: 'haproxy'
-              http_sd_configs:
-              - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
                 replacement: fsid
+              http_sd_configs:
+              - url: http://[::1]:8765/sd/prometheus/sd-config?service=haproxy
 
             - job_name: 'ceph-exporter'
-              honor_labels: true
               relabel_configs:
               - source_labels: [__address__]
                 target_label: cluster
                 replacement: fsid
+              honor_labels: true
               http_sd_configs:
               - url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter
@@ -937,6 +937,8 @@ class TestMonitoring:
             global:
               scrape_interval: 10s
              evaluation_interval: 10s
+              external_labels:
+                cluster: fsid
 
             rule_files:
             - /etc/prometheus/alerting/*
@@ -962,14 +964,17 @@ class TestMonitoring:
 
             scrape_configs:
             - job_name: 'ceph'
-              scheme: https
-              tls_config:
-                ca_file: root_cert.pem
-              honor_labels: true
               relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               - source_labels: [instance]
                 target_label: instance
                 replacement: 'ceph_cluster'
+              scheme: https
+              tls_config:
+                ca_file: root_cert.pem
+              honor_labels: true
               http_sd_configs:
               - url: https://[::1]:8765/sd/prometheus/sd-config?service=mgr-prometheus
                 basic_auth:
                   username: sd_user
                   password: sd_password
                 tls_config:
                   ca_file: root_cert.pem
@@ -979,6 +984,10 @@ class TestMonitoring:
                   ca_file: root_cert.pem
 
             - job_name: 'node'
+              relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               scheme: https
               tls_config:
                 ca_file: root_cert.pem
@@ -993,6 +1002,10 @@ class TestMonitoring:
                   ca_file: root_cert.pem
 
             - job_name: 'haproxy'
+              relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               scheme: https
               tls_config:
                 ca_file: root_cert.pem
@@ -1005,6 +1018,10 @@ class TestMonitoring:
                   ca_file: root_cert.pem
 
             - job_name: 'ceph-exporter'
+              relabel_configs:
+              - source_labels: [__address__]
+                target_label: cluster
+                replacement: fsid
               honor_labels: true
               scheme: https
               tls_config:
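With the cluster id now present both as a global external label and as a per-job relabel, an expected block like the one above can be sanity-checked quickly (assumes PyYAML is available; the snippet is illustrative, not part of the test suite):

    import yaml  # assumption: PyYAML is installed

    expected = """
    global:
      scrape_interval: 10s
      evaluation_interval: 10s
      external_labels:
        cluster: fsid
    """
    cfg = yaml.safe_load(expected)
    assert cfg['global']['external_labels']['cluster'] == 'fsid'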
diff --git a/src/pybind/mgr/dashboard/controllers/auth.py b/src/pybind/mgr/dashboard/controllers/auth.py
index 2e6cf855c2977..baadad369593a 100644
--- a/src/pybind/mgr/dashboard/controllers/auth.py
+++ b/src/pybind/mgr/dashboard/controllers/auth.py
@@ -126,7 +126,7 @@ class Auth(RESTController, ControllerAuthMixin):
                 ]
             }
         }
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return {
             'token': token,
             'username': username,
diff --git a/src/pybind/mgr/dashboard/controllers/multi_cluster.py b/src/pybind/mgr/dashboard/controllers/multi_cluster.py
index 1551f0969ff74..4d97ddd29e149 100644
--- a/src/pybind/mgr/dashboard/controllers/multi_cluster.py
+++ b/src/pybind/mgr/dashboard/controllers/multi_cluster.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
 
 import base64
+import ipaddress
 import json
-import re
+import logging
 import tempfile
 import time
 from typing import Any, Dict
 from urllib.parse import urlparse
@@ -19,6 +20,8 @@ from ..tools import configure_cors
 from . import APIDoc, APIRouter, CreatePermission, DeletePermission, Endpoint, \
     EndpointDoc, ReadPermission, RESTController, UIRouter, UpdatePermission
 
+logger = logging.getLogger('controllers.multi_cluster')
+
 
 @APIRouter('/multi-cluster', Scope.CONFIG_OPT)
 @APIDoc('Multi-cluster Management API', 'Multi-cluster')
@@ -78,7 +81,8 @@ class MultiCluster(RESTController):
             'ttl': ttl
         }
         cluster_token = self.check_cluster_connection(url, payload, username,
                                                       ssl_verify, ssl_certificate,
                                                       'connect')
 
         cors_endpoints_string = self.get_cors_endpoints_string(hub_url)
 
@@ -141,7 +145,8 @@ class MultiCluster(RESTController):
         cors_endpoints_string = ", ".join(cors_endpoints_set)
         return cors_endpoints_string
 
-    def check_cluster_connection(self, url, payload, username, ssl_verify, ssl_certificate):
+    def check_cluster_connection(self, url, payload, username, ssl_verify, ssl_certificate,
+                                 action):
         try:
             hub_cluster_version = mgr.version.split('ceph version ')[1]
             multi_cluster_content = self._proxy('GET', url, 'api/multi-cluster/get_config',
@@ -185,7 +190,7 @@ class MultiCluster(RESTController):
 
         managed_by_clusters_config = managed_by_clusters_content['value']
 
-        if len(managed_by_clusters_config) > 1:
+        if len(managed_by_clusters_config) > 1 and action == 'connect':
             raise DashboardException(msg='Cluster is already managed by another cluster',
                                      code='cluster_managed_by_another_cluster',
                                      component='multi-cluster')
@@ -222,7 +227,7 @@ class MultiCluster(RESTController):
                 "ssl_certificate": ssl_certificate if ssl_certificate else '',
                 "prometheus_access_info": prometheus_access_info
             }]
-        Settings.MULTICLUSTER_CONFIG = multi_cluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multi_cluster_config)
 
     def load_multi_cluster_config(self):
         if isinstance(Settings.MULTICLUSTER_CONFIG, str):
@@ -242,7 +247,7 @@ class MultiCluster(RESTController):
         multicluster_config = self.load_multi_cluster_config()
         multicluster_config.update({'current_url': config['url']})
         multicluster_config.update({'current_user': config['user']})
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return Settings.MULTICLUSTER_CONFIG
 
     @Endpoint('PUT')
@@ -259,9 +264,17 @@ class MultiCluster(RESTController):
         }
 
         cluster_token = self.check_cluster_connection(url, payload, username,
-                                                      ssl_verify, ssl_certificate)
+                                                      ssl_verify, ssl_certificate,
+                                                      'reconnect')
+
+        prometheus_url = self._proxy('GET', url, 'api/multi-cluster/get_prometheus_api_url',
+                                     token=cluster_token)
+
+        prometheus_access_info = self._proxy('GET', url,
+                                             'ui-api/multi-cluster/get_prometheus_access_info',  # noqa E501 #pylint: disable=line-too-long
+                                             token=cluster_token)
 
-        if username and cluster_token:
+        if username and cluster_token and prometheus_url and prometheus_access_info:
             if "config" in multicluster_config:
                 for _, cluster_details in multicluster_config["config"].items():
                     for cluster in cluster_details:
@@ -269,7 +282,11 @@
                             cluster['token'] = cluster_token
                             cluster['ssl_verify'] = ssl_verify
                             cluster['ssl_certificate'] = ssl_certificate
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+                            cluster['prometheus_access_info'] = prometheus_access_info
+                            _remove_prometheus_targets(cluster['prometheus_url'])
+                            time.sleep(5)
+                            _set_prometheus_targets(prometheus_url)
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return True
 
     @Endpoint('PUT')
@@ -285,7 +302,7 @@ class MultiCluster(RESTController):
                         cluster['cluster_alias'] = cluster_alias
                         cluster['ssl_verify'] = verify
                         cluster['ssl_certificate'] = ssl_certificate if verify else ''
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return Settings.MULTICLUSTER_CONFIG
 
     @Endpoint(method='DELETE')
@@ -303,16 +320,9 @@ class MultiCluster(RESTController):
                 cluster_token = value[0]['token']
                 cluster_ssl_certificate = value[0]['ssl_certificate']
                 cluster_ssl_verify = value[0]['ssl_verify']
-                orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
-                try:
-                    if orch_backend == 'cephadm':
-                        cmd = {
-                            'prefix': 'orch prometheus remove-target',
-                            'url': value[0]['prometheus_url'].replace('http://', '').replace('https://', '')  # noqa E501 #pylint: disable=line-too-long
-                        }
-                        mgr.mon_command(cmd)
-                except KeyError:
-                    pass
+                cluster_prometheus_url = value[0]['prometheus_url']
+
+                _remove_prometheus_targets(cluster_prometheus_url)
 
                 managed_by_clusters_content = self._proxy('GET', cluster_url,
                                                           'api/settings/MANAGED_BY_CLUSTERS',
@@ -332,13 +342,14 @@ class MultiCluster(RESTController):
                 del multicluster_config['config'][key]
                 break
 
-        Settings.MULTICLUSTER_CONFIG = multicluster_config
+        Settings.MULTICLUSTER_CONFIG = json.dumps(multicluster_config)
         return Settings.MULTICLUSTER_CONFIG
 
     @Endpoint()
     @ReadPermission
     def get_config(self):
-        return Settings.MULTICLUSTER_CONFIG
+        multi_cluster_config = self.load_multi_cluster_config()
+        return multi_cluster_config
 
     def is_token_expired(self, jwt_token):
         split_message = jwt_token.split(".")
@@ -388,8 +399,12 @@ class MultiCluster(RESTController):
         prometheus_url = Settings.PROMETHEUS_API_HOST
         if prometheus_url is not None:
             # check if is url is already in IP format
-            pattern = r'^(?:https?|http):\/\/(?:\d{1,3}\.){3}\d{1,3}:\d+$'
-            valid_ip_url = bool(re.match(pattern, prometheus_url))
+            try:
+                url_parts = urlparse(prometheus_url)
+                ipaddress.ip_address(url_parts.hostname)
+                valid_ip_url = True
+            except ValueError:
+                valid_ip_url = False
             if not valid_ip_url:
                 parsed_url = urlparse(prometheus_url)
                 hostname = parsed_url.hostname
@@ -408,7 +423,7 @@ class MultiClusterUi(RESTController):
     @UpdatePermission
     def set_cors_endpoint(self, url: str):
        configure_cors(url)
-    
+
     @Endpoint('GET')
     @ReadPermission
     def get_prometheus_access_info(self):
@@ -412,14 +427,14 @@ class MultiClusterUi(RESTController):
         orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
         if orch_backend == 'cephadm':
             cmd = {
                 'prefix': 'orch prometheus get-credentials',
             }
-            ret, out, _ = mgr.mon_command(cmd)
-            if ret == 0 and out is not None:
-                access_info = json.loads(out)
-                user = access_info['user']
-                password = access_info['password']
+            ret_status, out, _ = mgr.mon_command(cmd)
+            if ret_status == 0 and out is not None:
+                prom_access_info = json.loads(out)
+                user = prom_access_info['user']
+                password = prom_access_info['password']
 
             cert_cmd = {
                 'prefix': 'orch prometheus get-prometheus-cert',
             }
@@ -439,13 +454,30 @@ class MultiClusterUi(RESTController):
                 'password': password,
                 'certificate': prometheus_cert
             }
+        return None
 
 
 def _set_prometheus_targets(prometheus_url: str):
     orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
-    if orch_backend == 'cephadm':
-        cmd = {
-            'prefix': 'orch prometheus set-target',
-            'url': prometheus_url.replace('http://', '').replace('https://', '')
-        }
-        mgr.mon_command(cmd)
+    try:
+        if orch_backend == 'cephadm':
+            cmd = {
+                'prefix': 'orch prometheus set-target',
+                'url': prometheus_url.replace('http://', '').replace('https://', '')
+            }
+            mgr.mon_command(cmd)
+    except KeyError:
+        logger.exception('Failed to set prometheus targets')
+
+
+def _remove_prometheus_targets(prometheus_url: str):
+    orch_backend = mgr.get_module_option_ex('orchestrator', 'orchestrator')
+    try:
+        if orch_backend == 'cephadm':
+            cmd = {
+                'prefix': 'orch prometheus remove-target',
+                'url': prometheus_url.replace('http://', '').replace('https://', '')
+            }
+            mgr.mon_command(cmd)
+    except KeyError:
+        logger.exception('Failed to remove prometheus targets')
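Taken together, the reconnect path swaps the federation target in two steps. A condensed sketch built on the helpers defined above (swap_prometheus_target is an illustrative wrapper; the 5-second pause is the same grace period the reconnect endpoint uses before registering the new target):

    import time

    def swap_prometheus_target(old_url: str, new_url: str) -> None:
        _remove_prometheus_targets(old_url)  # 'orch prometheus remove-target'
        time.sleep(5)                        # let cephadm settle before re-adding
        _set_prometheus_targets(new_url)     # 'orch prometheus set-target'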
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
index cfb3435425463..9f05ab668ab5f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts
@@ -50,6 +50,7 @@ export class MultiClusterListComponent extends ListWithDetails implements OnInit
   currentUrl: string;
   icons = Icons;
   managedByConfig$: Observable<any>;
+  prometheusConnectionError: any[] = [];
 
   constructor(
     private multiClusterService: MultiClusterService,
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html
index 6d0722dcfea70..7a3657a2aa906 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.html
@@ -66,6 +66,25 @@
+      <cd-alert-panel *ngIf="prometheusConnectionError.length > 0"
+                      type="danger"
+                      i18n>
+        Couldn't fetch metrics from the following clusters. Please reconnect the respective clusters to re-establish the prometheus connection -
+        <span *ngFor="let cluster of prometheusConnectionError">
+          <a href="javascript:void(0)"
+             (click)="openReconnectClusterForm(cluster)">
+            {{ cluster['cluster_alias']}} - {{ cluster['cluster_name'] }}
+          </a>
+        </span>
+      </cd-alert-panel>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster.component.ts
@@ -55,6 +56,8 @@ export class MultiClusterComponent implements OnInit, OnDestroy {
; + clusterDetailsArray: any[]; + prometheusConnectionError: any[] = []; constructor( private multiClusterService: MultiClusterService, @@ -164,6 +167,7 @@ export class MultiClusterComponent implements OnInit, OnDestroy { this.subs.add( this.multiClusterService.subscribe((resp: any) => { this.isMultiCluster = Object.keys(resp['config']).length > 1; + this.clusterDetailsArray = Object.values(resp['config']).flat(); const hubUrl = resp['hub_url']; for (const key in resp['config']) { if (resp['config'].hasOwnProperty(key)) { @@ -243,7 +247,8 @@ export class MultiClusterComponent implements OnInit, OnDestroy { 'POOL_IOPS_UTILIZATION', 'POOL_THROUGHPUT_UTILIZATION', 'HOSTS', - 'CLUSTER_ALERTS' + 'CLUSTER_ALERTS', + 'FEDERATE_UP_METRIC' ]; let validSelectedQueries = allMultiClusterQueries; @@ -334,6 +339,10 @@ export class MultiClusterComponent implements OnInit, OnDestroy { const osds = this.findClusterData(this.queriesResults?.OSDS, clusterName); const status = this.findClusterData(this.queriesResults?.HEALTH_STATUS, clusterName); const available_capacity = totalCapacity - usedCapacity; + const federateMetrics = this.queriesResults?.FEDERATE_UP_METRIC.filter( + (metric: any) => metric.metric.job === 'federate' + ); + this.checkFederateMetricsStatus(federateMetrics); clusters.push({ cluster: clusterName.trim(), @@ -390,6 +399,59 @@ export class MultiClusterComponent implements OnInit, OnDestroy { ); } + checkFederateMetricsStatus(federateMetrics: any) { + this.prometheusConnectionError = []; + federateMetrics.forEach((entry1: { metric: { instance: any }; value: any }) => { + const instanceIpPort = entry1.metric.instance; + const instanceIp = instanceIpPort.split(':')[0]; + const instancePort = instanceIpPort.split(':')[1]; + const prometheus_federation_status = entry1.value[1]; + + this.clusterDetailsArray.forEach((entry2) => { + if (entry2['name'] !== this.localClusterName) { + const prometheusUrl = entry2['prometheus_url'] + .replace('http://', '') + .replace('https://', ''); + const prometheusIp = prometheusUrl.split(':')[0]; + const prometheusPort = prometheusUrl.split(':')[1]; + + if ( + instanceIp === prometheusIp && + instancePort === prometheusPort && + prometheus_federation_status === '0' + ) { + this.prometheusConnectionError.push({ + cluster_name: entry2.name, + cluster_alias: entry2.cluster_alias, + url: entry2.url, + user: entry2.user, + ssl_verify: entry2.ssl_verify, + ssl_certificate: entry2.ssl_certificate + }); + } + } + }); + }); + } + + openReconnectClusterForm(cluster: any) { + const initialState = { + action: 'reconnect', + cluster: cluster + }; + this.bsModalRef = this.modalService.show(MultiClusterFormComponent, initialState, { + size: 'lg' + }); + this.bsModalRef.componentInstance.submitAction.subscribe(() => { + this.loading = true; + setTimeout(() => { + const currentRoute = this.router.url.split('?')[0]; + this.multiClusterService.refreshMultiCluster(currentRoute); + this.getPrometheusData(this.prometheusService.lastHourDateObject); + }, this.PROMETHEUS_DELAY); + }); + } + findClusterData(metrics: any, clusterName: string) { const clusterMetrics = this.findCluster(metrics, clusterName); return parseInt(clusterMetrics?.value[1] || 0); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts index 9c2dcda4d8dc1..7cd1a76a8c1e9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts +++ 
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts
index 9c2dcda4d8dc1..7cd1a76a8c1e9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/multi-cluster.service.ts
@@ -17,6 +17,8 @@ export class MultiClusterService {
   tokenStatusSource$ = this.tokenStatusSource.asObservable();
   showDeletionMessage = false;
   isClusterAddedFlag = false;
+  prometheusConnectionError: any[] = [];
+
   constructor(
     private http: HttpClient,
     private timerService: TimerService,
@@ -219,6 +221,13 @@ export class MultiClusterService {
     return this.isClusterAddedFlag;
   }
 
+  managePrometheusConnectionError(prometheusConnectionError?: any[]) {
+    if (prometheusConnectionError !== undefined) {
+      this.prometheusConnectionError = prometheusConnectionError;
+    }
+    return this.prometheusConnectionError;
+  }
+
   refreshMultiCluster(currentRoute: string) {
     this.refresh();
     this.refreshTokenStatus();
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts
index 1a7c7af9d3a46..361a404a11b20 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/enum/dashboard-promqls.enum.ts
@@ -34,9 +34,10 @@ export enum MultiClusterPromqls {
   CRITICAL_ALERTS_COUNT = 'count(ALERTS{alertstate="firing",severity="critical"}) or vector(0)',
   WARNING_ALERTS_COUNT = 'count(ALERTS{alertstate="firing",severity="warning"}) or vector(0)',
   ALERTS = 'ALERTS{alertstate="firing"}',
-  HOSTS = 'count_values("hostname", ceph_mon_metadata) by (cluster) or vector (0)',
-  TOTAL_HOSTS = 'count(sum by (hostname) (ceph_osd_metadata)) or vector(0)',
-  CLUSTER_ALERTS = 'count by (cluster) (ALERTS{alertstate="firing"}) or vector(0)'
+  HOSTS = 'sum by (hostname, cluster) (group by (hostname, cluster) (ceph_osd_metadata)) or vector(0)',
+  TOTAL_HOSTS = 'count by (cluster) (ceph_osd_metadata) or vector(0)',
+  CLUSTER_ALERTS = 'count by (cluster) (ALERTS{alertstate="firing"}) or vector(0)',
+  FEDERATE_UP_METRIC = 'up'
 }
 
 export enum MultiClusterPromqlsForClusterUtilization {
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index 7839e1e8386e7..c91c4d87c1586 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -791,8 +791,8 @@ class Orchestrator(object):
         """set alertmanager access information"""
         raise NotImplementedError()
 
-    def get_prometheus_cert(self, url: str) -> OrchResult[str]:
-        """set prometheus target for multi-cluster"""
+    def get_prometheus_cert(self) -> OrchResult[str]:
+        """get prometheus cert for multi-cluster"""
         raise NotImplementedError()
 
     def set_prometheus_access_info(self, user: str, password: str) -> OrchResult[str]:
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index d3ef1e59817ab..95b2843d3e339 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -1232,7 +1232,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
         completion = self.set_prometheus_target(url)
         result = raise_if_exception(completion)
         return HandleCommandResult(stdout=json.dumps(result))
-    
+
     @_cli_write_command('orch prometheus get-prometheus-cert')
     def _get_prometheus_cert(self) -> HandleCommandResult:
         completion = self.get_prometheus_cert()
-- 
2.39.5
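For completeness, the CLI plumbing above can be consumed from another mgr module along these lines (a sketch: fetch_prometheus_cert is a hypothetical helper, mgr stands for the active MgrModule instance, and the output is assumed to be the PEM text returned by get_prometheus_cert):

    def fetch_prometheus_cert(mgr):
        # mon_command returns (retval, stdout, stderr) on MgrModule instances.
        ret, out, err = mgr.mon_command({'prefix': 'orch prometheus get-prometheus-cert'})
        if ret != 0:
            raise RuntimeError(f'get-prometheus-cert failed: {err}')
        return out  # PEM-encoded certificate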