From 23d35f608dd642eddeab83c813f4db9bd2707ae9 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Juan=20Miguel=20Olmo=20Mart=C3=ADnez?= <jolmomar@ibm.com>
Date: Mon, 24 Jun 2024 12:51:33 +0200
Subject: [PATCH] mgr/callhome: ISCE-739 Support UI - Call Home info
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Resolves: rhbz#2303389

Signed-off-by: Juan Miguel Olmo MartÃ­nez <jolmomar@ibm.com>
(cherry picked from commit 4d457b9f8314478ee12ace9944d4dce1b51ca6fe)
---
 src/pybind/mgr/call_home_agent/module.py      | 115 +++++++++++++++---
 .../mgr/call_home_agent/tests/test_agent.py   |  21 +++-
 2 files changed, 121 insertions(+), 15 deletions(-)

diff --git a/src/pybind/mgr/call_home_agent/module.py b/src/pybind/mgr/call_home_agent/module.py
index 8684148881d..9e0085c59d5 100644
--- a/src/pybind/mgr/call_home_agent/module.py
+++ b/src/pybind/mgr/call_home_agent/module.py
@@ -56,6 +56,24 @@ class SendError(Exception):
 # Prometheus API.
 sent_alerts = {}
 
+def get_prometheus_url(mgr: Any) -> str:
+    """
+    Provides the prometheus server URL
+    """
+    daemon_list = mgr.remote('cephadm', 'list_daemons', service_name='prometheus')
+    if daemon_list.exception_str:
+        raise Exception(f"Alert report: Error finding the Prometheus instance: {daemon_list.exception_str}")
+    if len(daemon_list.result) < 1:
+        raise Exception(f"Alert report: Can't find the Prometheus instance")
+
+    d = daemon_list.result[0]
+    host = d.ip if d.ip else d.hostname  # ip is of type str
+    port = str(d.ports[0]) if d.ports else ""  # ports is a list of ints
+    if not (host and port):
+        raise Exception(f"Can't get Prometheus IP and/or port from manager")
+
+    return f"http://{host}:{port}/api/v1"
+
 def get_status(mgr: Any) -> dict:
     r, outb, outs = mgr.mon_command({
         'prefix': 'status',
@@ -69,11 +87,90 @@ def get_status(mgr: Any) -> dict:
         status_dict = json.loads(outb)
         status_dict["ceph_version"] = mgr.version
         status_dict["health_detail"] = json.loads(mgr.get('health')['json'])
+        status_dict["support"] = get_support_metrics(mgr)
+        status_dict["support"]["health_status"] = status_dict["health_detail"]["status"]
+        status_dict["support"]["health_summary"] = get_health_summary(status_dict["health_detail"])
         return status_dict
     except Exception as ex:
         mgr.log.exception(str(ex))
         return {'exception': str(ex)}
 
+def get_health_summary(ceph_health: dict) -> str:
+    health_summary = ""
+    for error_key, error_details in ceph_health["checks"].items():
+        msg = "\n".join([item["message"] for item in error_details.get("detail",[])])
+        health_summary += f'{error_key}({error_details["severity"]}): {error_details["summary"]["message"]}\n{msg}\n'
+    return health_summary
+
+def get_support_metrics(mgr) -> dict:
+    """
+    Collect cluster metrics needed for Ceph support team tools
+    """
+    support_metrics = {}
+    status_interval_minutes = os.environ.get('CHA_INTERVAL_STATUS_REPORT_SECONDS',
+                              mgr.get_module_option('interval_status_report_seconds'))
+    try:
+        query_url = f"{get_prometheus_url(mgr)}/query"
+        queries = {
+            'total_capacity_bytes': 'sum(ceph_osd_stat_bytes)',
+            'total_raw_usage_bytes': 'sum(ceph_osd_stat_bytes_used)',
+            'usage_percentage': '(sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)) * 100',
+            'slow_ops_total': 'sum(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})',
+            'osds_total_with_slow_ops': 'count(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}>0) or on() vector(0)',
+            'pg_total': 'sum(ceph_pg_total)',
+            'pg_active': 'sum(ceph_pg_active)',
+            'pg_clean': 'sum(ceph_pg_clean)',
+            'pg_degraded': 'sum(ceph_pg_degraded)',
+            'pg_unknown': 'sum(ceph_pg_unknown)',
+            'pg_down': 'sum(ceph_pg_down)',
+            'pg_scrubbing': 'sum(ceph_pg_scrubbing)',
+            'pg_deep_scrubbing': 'sum(ceph_pg_deep)',
+            'network_receive_errors': f'avg(increase(node_network_receive_errs_total{{device!="lo"}}[{status_interval_minutes}m]))',
+            'network_send_errors': f'avg(increase(node_network_transmit_errs_total{{device!="lo"}}[{status_interval_minutes}m]))',
+            'network_receive_packet_drops': f'avg(increase(node_network_receive_drop_total{{device!="lo"}}[{status_interval_minutes}m]))',
+            'network_transmit_packet_drops': f'avg(increase(node_network_transmit_drop_total{{device!="lo"}}[{status_interval_minutes}m]))',
+            'inconsistent_mtu': 'sum(node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=  quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=  quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))) or vector(0))',
+            'pool_number': 'count(ceph_pool_bytes_used)',
+            'raw_capacity_bytes': 'sum(ceph_osd_stat_bytes)',
+            'raw_capacity_consumed_bytes': 'sum(ceph_pool_bytes_used)',
+            'logical_stored_bytes': 'sum(ceph_pool_stored)',
+            'pool_growth_bytes': f'sum(delta(ceph_pool_stored[{status_interval_minutes}m]))',
+            'pool_bandwidth_bytes': f'sum(rate(ceph_pool_rd_bytes[{status_interval_minutes}m]) + rate(ceph_pool_wr_bytes[{status_interval_minutes}m]))',
+            'pg_per_osd_ratio':'(avg(ceph_osd_numpg)/sum(ceph_pg_total))*100',
+            'monitors_number': 'count(ceph_mon_metadata)',
+            'monitors_not_in_quorum_number': 'count(ceph_mon_quorum_status!=1) or on() vector(0)',
+            'clock_skews_number': 'ceph_health_detail{name="MON_CLOCK_SKEW"} or on() vector(0)',
+        }
+
+        t1 = time.time()
+        for k,q in queries.items():
+            data = exec_prometheus_query(query_url, q)
+            try:
+                support_metrics[k] = float(data['data']['result'][0]['value'][1])
+            except Exception as ex:
+                 mgr.log.error(f"Error reading status metric for support <{k}>: {ex} - {data}")
+        total_time = round((time.time() - t1) * 1000, 2)
+        support_metrics['time_to_get_support_data_ms'] = total_time
+        mgr.log.info(f"Time to get support data for status report: {total_time} ms")
+    except Exception as ex:
+        mgr.log.error(f"Error collecting support data for status report: {ex}")
+
+    return support_metrics
+
+def exec_prometheus_query(query_url: str, prom_query: str) -> dict:
+    """
+    Execute a Prometheus query and returns the result as dict
+    """
+    result = {}
+    r = None
+    try:
+        r = requests.get(query_url, params={'query': prom_query})
+        result = json.loads(r.text)
+        r.raise_for_status()
+    except Exception as ex:
+        raise Exception(f"Error executing Prometheus query: {ex}-{result}")
+    return result
+
 def inventory_get_hardware_status(mgr: Any) -> dict:
     try:
         hw_status = mgr.remote('orchestrator', 'node_proxy_summary')
@@ -561,24 +658,13 @@ def get_prometheus_alerts(mgr):
     Returns a list of all the alerts currently active in Prometheus
     """
     try:
-        daemon_list = mgr.remote('cephadm', 'list_daemons', service_name='prometheus')
-        if daemon_list.exception_str:
-            raise Exception(f"Alert report: Error finding the Prometheus instance: {daemon_list.exception_str}")
-        if len(daemon_list.result) < 1:
-            raise Exception(f"Alert report: Can't find the Prometheus instance")
-
-        d = daemon_list.result[0]
-        host = d.ip if d.ip else d.hostname  # ip is of type str
-        port = str(d.ports[0]) if d.ports else ""  # ports is a list of ints
-        if not (host and port):
-            raise Exception(f"Can't get Prometheus IP and/or port from manager")
-
+        alerts_url = f"{get_prometheus_url(mgr)}/alerts"
         # Get the alerts
         resp = {}
         try:
-            resp = requests.get(f"http://{host}:{port}/api/v1/alerts").json()
+            resp = requests.get(alerts_url).json()
         except Exception as e:
-            raise Exception(f"Error getting alerts from Prometheus at {host}:{port} : {e}")
+            raise Exception(f"Error getting alerts from Prometheus at {alerts_url} : {e}")
 
         if 'data' not in resp or 'alerts' not in resp['data']:
             raise Exception(f"Prometheus returned a bad reply: {resp}")
@@ -1404,3 +1490,4 @@ class CallHomeAgent(MgrModule):
             return HandleCommandResult(stderr=str(ex))
         else:
             return HandleCommandResult(stdout=output)
+
diff --git a/src/pybind/mgr/call_home_agent/tests/test_agent.py b/src/pybind/mgr/call_home_agent/tests/test_agent.py
index 8bc46191f8f..640561ae3a2 100644
--- a/src/pybind/mgr/call_home_agent/tests/test_agent.py
+++ b/src/pybind/mgr/call_home_agent/tests/test_agent.py
@@ -4,7 +4,7 @@ import json
 
 from unittest.mock import MagicMock, Mock, patch
 
-from call_home_agent.module import Report
+from call_home_agent.module import Report, exec_prometheus_query
 
 TEST_JWT_TOKEN = r"eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJ0ZXN0IiwiaWF0IjoxNjkxNzUzNDM5LCJqdGkiOiIwMTIzNDU2Nzg5MDEyMzQ1Njc4OTAwMTIzNDU2Nzg5MCJ9.0F66k81_PmKoSd9erQoxnq73760SXs8WQTd3s8pqEFY\\"
 EXPECTED_JTI = '01234567890123456789001234567890'
@@ -197,5 +197,24 @@ class TestReport(unittest.TestCase):
         self.report.send(force=True)
         mock_post.assert_called()
 
+    @patch('requests.get')
+    def test_exec_prometheus_query(self, mock_get):
+        request_get_response = MagicMock(status_code=200, reason='pepe', text='{"status":"success","data":{"resultType":"vector","result":[{"metric":{"ceph_health":"HEALTH_OK"},"value":[1616414100,"1"]}]}}')
+        mock_get.return_value = request_get_response
+        result = exec_prometheus_query("http://prom/query/v1", "ceph_health")
+        assert result['status'] ==  "success"
+
+        # Test metric error (server is ok, but something wrong executing the query):
+        request_get_response.raise_for_status = MagicMock(side_effect=Exception("Error in metrics"))
+        with self.assertRaises(Exception) as exception_context:
+            result = exec_prometheus_query("http://prom/query/v1", "ceph_health")
+        self.assertRegex(str(exception_context.exception), "Error in metrics")
+
+        # Result metrics not returned because a Prometheus server problem
+        mock_get.side_effect=Exception("Server error")
+        with self.assertRaises(Exception) as exception_context:
+            result = exec_prometheus_query("http://prom/query/v1", "ceph_health")
+        self.assertRegex(str(exception_context.exception), "Server error")
+
     def tearDown(self):
         self.patcher.stop()
-- 
2.47.3