]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
cephadm: Add pre_remove and ensure deployment values are reset and API settings are...
author Kushal Deb <Kushal.Deb@ibm.com>
Fri, 29 Nov 2024 08:38:51 +0000 (14:08 +0530)
committer Kushal Deb <Kushal.Deb@ibm.com>
Tue, 4 Feb 2025 05:23:18 +0000 (10:53 +0530)
This fixes an issue where the dashboard API settings are not updated
properly when the active Prometheus or Alertmanager daemon is removed.
If the active daemon is removed, the settings are reconfigured to point
to a remaining daemon or reset if no daemons are available.

This avoids dashboard errors like "404 Not Found" caused by stale API
host settings.

Signed-off-by: Kushal Deb <Kushal.Deb@ibm.com>
src/pybind/mgr/cephadm/services/monitoring.py

index bd0620f595f5dc21a7874acc8adefe16e3a0ba38..8cdad5e7f5d20150e0160fce4dd6cd3efed282a8 100644 (file)
@@ -4,6 +4,8 @@ import os
 import socket
 from typing import List, Any, Tuple, Dict, Optional, cast, TYPE_CHECKING
 import ipaddress
+import time
+import requests
 
 from mgr_module import HandleCommandResult
 
@@ -442,6 +444,36 @@ class AlertmanagerService(CephadmService):
                 service_url
             )
 
+    def pre_remove(self, daemon: DaemonDescription) -> None:
+        """
+        Called before Alertmanager is removed
+        """
+        if daemon.hostname is None:
+            return
+        try:
+            current_api_host = self.mgr.check_mon_command({"prefix": "dashboard get-alertmanager-api-host"}).stdout.strip()
+            daemon_addr = daemon.ip if daemon.ip else self.mgr.get_fqdn(daemon.hostname)
+            daemon_port = daemon.ports[0] if daemon.ports else self.DEFAULT_SERVICE_PORT
+            service_url = build_url(scheme='http', host=daemon_addr, port=daemon_port)
+
+            if current_api_host == service_url:
+                # This is the active daemon, update or reset the settings
+                remaining_daemons = [
+                    d for d in self.mgr.cache.get_daemons_by_service(self.TYPE)
+                    if d.name() != daemon.name()
+                ]
+                if remaining_daemons:
+                    self.config_dashboard(remaining_daemons)
+                    logger.info("Updated dashboard API settings to point to a remaining Alertmanager daemon")
+                else:
+                    self.mgr.check_mon_command({"prefix": "dashboard reset-alertmanager-api-host"})
+                    self.mgr.check_mon_command({"prefix": "dashboard reset-alertmanager-api-ssl-verify"})
+                    logger.info("Reset dashboard API settings as no Alertmnager daemons are remaining")
+            else:
+                logger.info(f"Alertmanager {daemon.name()} removed; no changes to dashboard API settings")
+        except Exception as e:
+            logger.error(f"Error in Alertmanager pre_remove: {str(e)}")
+
     def ok_to_stop(self,
                    daemon_ids: List[str],
                    force: bool = False,
@@ -716,6 +748,48 @@ class PrometheusService(CephadmService):
                 service_url
             )
 
+    def pre_remove(self, daemon: DaemonDescription) -> None:
+        """
+        Called before Prometheus daemon is removed
+        """
+        MAX_RETRIES = 5
+        RETRY_INTERVAL = 5
+        if daemon.hostname is None:
+            return
+        try:
+            current_api_host = self.mgr.check_mon_command({"prefix": "dashboard get-prometheus-api-host"}).stdout.strip()
+            daemon_addr = daemon.ip if daemon.ip else self.mgr.get_fqdn(daemon.hostname)
+            daemon_port = daemon.ports[0] if daemon.ports else self.DEFAULT_SERVICE_PORT
+            service_url = build_url(scheme="http", host=daemon_addr, port=daemon_port)
+
+            if current_api_host == service_url:
+                remaining_daemons = [
+                    d for d in self.mgr.cache.get_daemons_by_service(self.TYPE)
+                    if d.name() != daemon.name()
+                ]
+                if remaining_daemons:
+                    self.config_dashboard(remaining_daemons)
+                    logger.info("Updated Dashboard Settings to point to remaining Prometheus daemons")
+                    for attempt in range(MAX_RETRIES):
+                        try:
+                            response = requests.get(f"{service_url}/api/v1/rules", timeout=5)
+                            if response.status_code == 200:
+                                logger.info(f"Prometheus daemon is ready at {service_url}.")
+                                break
+                        except Exception as e:
+                            logger.info(f"Retry {attempt + 1}: Waiting for Prometheus daemon at {service_url}: {e}")
+                        time.sleep(RETRY_INTERVAL)
+                    else:
+                        logger.warning("Prometheus daemon did not become ready after retries.")
+                else:
+                    self.mgr.check_mon_command({"prefix": "dashboard reset-prometheus-api-host"})
+                    self.mgr.check_mon_command({"prefix": "dashboard reset-prometheus-api-ssl-verify"})
+                    logger.info("Reset Prometheus API settings as no daemons are remaining")
+            else:
+                logger.info("Prometheus daemon removed; no changes to dashboard API settings")
+        except Exception as e:
+            logger.error(f"Error in Prometheus pre_remove {str(e)}")
+
     def ok_to_stop(self,
                    daemon_ids: List[str],
                    force: bool = False,