From 79781b4ea83ed91cb485cac4d8644fa10b8d357b Mon Sep 17 00:00:00 2001
From: Laura Flores <lflores@redhat.com>
Date: Wed, 20 Oct 2021 23:54:30 +0000
Subject: [PATCH] mgr/telemetry: provide separated and aggregated data for
 osd_perf_histograms

    Currently, data is being aggregated by daemon type in the Telemetry perf channel. For instance, all data for OSDs is combined under one general "osd" category.

    Upon further discussion, it has come to light that it may be more fruitful for us to keep the data separated (i.e. osd.0. osd.1, osd.2). This is because we believe it will be better to derive accurate conclusions about problems in performance if we are able to analyze the daemons separately. Providing an aggregated summary may not be enough to pinpoint the root of a performance problem.

    In this commit, I have provided an option to view the osd_perf_histograms data both ways: aggregated AND separated.

    After the perf channel undergoes a trial run on a long-running cluster, we will have the insight we need to determine which option to keep. The option we deem most appropriate, aggregated or separated, will ultimately be chosen to include in the report.

    (The change to the schema here has no effect on the dashboard formatting.)

Signed-off-by: Laura Flores <lflores@redhat.com>
---
 src/pybind/mgr/telemetry/module.py | 74 +++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 28 deletions(-)

diff --git a/src/pybind/mgr/telemetry/module.py b/src/pybind/mgr/telemetry/module.py
index b4a551a4ebd..165291581a4 100644
--- a/src/pybind/mgr/telemetry/module.py
+++ b/src/pybind/mgr/telemetry/module.py
@@ -354,7 +354,7 @@ class Module(MgrModule):
 
         return result
 
-    def get_osd_histograms(self) -> List[Dict[str, dict]]:
+    def get_osd_histograms(self, mode: str = 'separated') -> List[Dict[str, dict]]:
         # Initialize result dict
         result: Dict[str, dict] = defaultdict(lambda: defaultdict(
                                               lambda: defaultdict(
@@ -364,7 +364,7 @@ class Module(MgrModule):
 
         # Get list of osd ids from the metadata
         osd_metadata = self.get('osd_metadata')
-        
+
         # Grab output from the "osd.x perf histogram dump" command
         for osd_id in osd_metadata:
             cmd_dict = {
@@ -422,8 +422,7 @@ class Module(MgrModule):
                         # At this point, you will see that the name of the key is the string
                         # form of our axes list (str(axes)). This is there so that histograms
                         # with different axis configs will not be combined.
-                        # The key names are modified into something more readable ('config_x')
-                        # down below.
+                        # These key names are later dropped when only the values are returned.
                         result[str(axes)][histogram]['axes'] = axes
 
                         # Collect current values and make sure they are in
@@ -432,21 +431,30 @@ class Module(MgrModule):
                         for value_list in dump['osd'][histogram]['values']:
                             values.append([int(v) for v in value_list])
 
-                        # Aggregate values. If 'values' have already been initialized,
-                        # we can safely add.
-                        if 'values' in result[str(axes)][histogram]:
-                            for i in range (0, len(values)):
-                                for j in range (0, len(values[i])):
-                                    values[i][j] += result[str(axes)][histogram]['values'][i][j]
-
-                        # Add the values to result.
-                        result[str(axes)][histogram]['values'] = values
-
-                        # Update num_combined_osds
-                        if 'num_combined_osds' not in result[str(axes)][histogram]:
-                            result[str(axes)][histogram]['num_combined_osds'] = 1
+                        if mode == 'separated':
+                            if 'osds' not in result[str(axes)][histogram]:
+                                result[str(axes)][histogram]['osds'] = []
+                            result[str(axes)][histogram]['osds'].append({'osd_id': int(osd_id), 'values': values})
+
+                        elif mode == 'aggregated':
+                            # Aggregate values. If 'values' have already been initialized,
+                            # we can safely add.
+                            if 'values' in result[str(axes)][histogram]:
+                                for i in range (0, len(values)):
+                                    for j in range (0, len(values[i])):
+                                        values[i][j] += result[str(axes)][histogram]['values'][i][j]
+
+                            # Add the values to result.
+                            result[str(axes)][histogram]['values'] = values
+
+                            # Update num_combined_osds
+                            if 'num_combined_osds' not in result[str(axes)][histogram]:
+                                result[str(axes)][histogram]['num_combined_osds'] = 1
+                            else:
+                                result[str(axes)][histogram]['num_combined_osds'] += 1
                         else:
-                            result[str(axes)][histogram]['num_combined_osds'] += 1
+                            self.log.error('Incorrect mode specified in get_osd_histograms: {}'.format(mode))
+                            return list()
 
                 # Sometimes, json errors occur if you give it an empty string.
                 # I am also putting in a catch for a KeyError since it could
@@ -913,7 +921,9 @@ class Module(MgrModule):
 
             report['stat_sum_per_pool'] = self.get_stat_sum_per_pool()
             report['io_rate'] = self.get_io_rate()
-            report['osd_perf_histograms'] = self.get_osd_histograms()
+
+            report['osd_perf_histograms_aggregated'] = self.get_osd_histograms('aggregated')
+            report['osd_perf_histograms_separated'] = self.get_osd_histograms('separated')
 
         # NOTE: We do not include the 'device' channel in this report; it is
         # sent to a different endpoint.
@@ -1042,15 +1052,23 @@ Please consider enabling the telemetry module with 'ceph telemetry on'.'''
         # ranges and values, which are currently in list form, into strings so that
         # they are displayed horizontally instead of vertically.
         try:
-            for config in report['osd_perf_histograms']:
-                for histogram in config:
-                    # Adjust ranges by converting lists into strings
-                    for axis in config[histogram]['axes']:
-                        for i in range(0, len(axis['ranges'])):
-                            axis['ranges'][i] = str(axis['ranges'][i])
-                    # Adjust values by converting lists into strings
-                    for i in range(0, len(config[histogram]['values'])):
-                        config[histogram]['values'][i] = str(config[histogram]['values'][i])
+            # Formatting ranges and values in osd_perf_histograms
+            modes_to_be_formatted = ['osd_perf_histograms_aggregated', 'osd_perf_histograms_separated']
+            for mode in modes_to_be_formatted:
+                for config in report[mode]:
+                    for histogram in config:
+                        # Adjust ranges by converting lists into strings
+                        for axis in config[histogram]['axes']:
+                            for i in range(0, len(axis['ranges'])):
+                                axis['ranges'][i] = str(axis['ranges'][i])
+                        # Adjust values by converting lists into strings
+                        if mode == 'osd_perf_histograms_aggregated':
+                            for i in range(0, len(config[histogram]['values'])):
+                                config[histogram]['values'][i] = str(config[histogram]['values'][i])
+                        else: # if mode == 'osd_perf_histograms_separated'
+                            for osd in config[histogram]['osds']:
+                                for i in range(0, len(osd['values'])):
+                                    osd['values'][i] = str(osd['values'][i])
         except KeyError:
             # If the perf channel is not enabled, there should be a KeyError since
             # 'osd_perf_histograms' would not be present in the report. In that case,
-- 
2.47.3