]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
telemetry/module.py: update gather_perf_counters to emit values of labeled counters 61997/head
authorNaveen Naidu <naveennaidu479@gmail.com>
Tue, 25 Feb 2025 17:36:26 +0000 (17:36 +0000)
committerNaveen Naidu <naveennaidu479@gmail.com>
Tue, 8 Apr 2025 02:26:46 +0000 (07:56 +0530)
Until now, gather_perf_counters only included the values of unlabeled
counters. We update the API to include the values of labeled counters.
This change also updates the format of how these values are now emitted.

The perf counters in the telemetry report are now shown as below:
```
"perf_counters": {
        "mon.239f8ba73d60451130f972fcd40d9b409b1bfb66": {
            "AsyncMessenger::Worker": [
                {
                    "counters": {
                        "msgr_connection_idle_timeouts": {
                            "value": 0
                        },
                        "msgr_connection_ready_timeouts": {
                            "value": 0
                        }
                    },
                    "labels": {
                        "id": "0"
                    }
                }
            ],
```

Notice that each entry in a collection's list now has two new subfields:
'counters' and 'labels'.

Signed-off-by: Naveen Naidu <naveen.naidu@ibm.com>
src/pybind/mgr/telemetry/module.py

index cd13dd79643191a3153784300c1ed5ce0fbbde9a..c9aa3b2792d0cff7d067678294bccc99af4579d7 100644 (file)
@@ -808,27 +808,36 @@ class Module(MgrModule):
         return crashlist
 
     def gather_perf_counters(self, mode: str = 'separated') -> Dict[str, dict]:
-        # Extract perf counter data with get_unlabeled_perf_counters(), a method
-        # from mgr/mgr_module.py. This method returns a nested dictionary that
-        # looks a lot like perf schema, except with some additional fields.
-        #
-        # Example of output, a snapshot of a mon daemon:
-        #   "mon.b": {
-        #       "bluestore.kv_flush_lat": {
-        #           "count": 2431,
-        #           "description": "Average kv_thread flush latency",
-        #           "nick": "fl_l",
-        #           "priority": 8,
-        #           "type": 5,
-        #           "units": 1,
-        #           "value": 88814109
-        #       },
-        #   },
-        perf_counters = self.get_unlabeled_perf_counters()
+        """
+        Extract perf counter data with get_perf_counters(), a method from
+        mgr/mgr_module.py. This method returns a nested dictionary that looks a
+        lot like perf schema, except with some additional fields.
+
+        Example of output, a snapshot of a mon daemon:
+            "mon.b":{
+                "bluestore": [
+                    {
+                        "labels": {},
+                        "counters": {
+                            "kv_flush_lat": {
+                                "description": "bluestore.kv_flush_lat",
+                                "nick": "kfsl",
+                                "type": 5,
+                                "priority": 8,
+                                "units": 1,
+                                "value": 14814406948,
+                                "count": 141
+                            },
+                        }
+                    },
+                ]
+            }
+
+        """
+        perf_counters = self.get_perf_counters()
 
         # Initialize 'result' dict
-        result: Dict[str, dict] = defaultdict(lambda: defaultdict(
-            lambda: defaultdict(lambda: defaultdict(int))))
+        result: Dict[str, dict] = defaultdict(lambda: defaultdict(list))
 
         # 'separated' mode
         anonymized_daemon_dict = {}
@@ -850,11 +859,7 @@ class Module(MgrModule):
                 else:
                     result[daemon_type]['num_combined_daemons'] += 1
 
-            for collection in perf_counters_by_daemon:
-                # Split the collection to avoid redundancy in final report; i.e.:
-                #   bluestore.kv_flush_lat, bluestore.kv_final_lat -->
-                #   bluestore: kv_flush_lat, kv_final_lat
-                col_0, col_1 = collection.split('.')
+            for collection, sub_collection_list in perf_counters_by_daemon.items():
 
                 # Debug log for empty keys. This initially was a problem for prioritycache
                 # perf counters, where the col_0 was empty for certain mon counters:
@@ -864,42 +869,52 @@ class Module(MgrModule):
                 #        "cache_bytes": {...},                          "cache_bytes": {...},
                 #
                 # This log is here to detect any future instances of a similar issue.
-                if (daemon == "") or (col_0 == "") or (col_1 == ""):
+                if (daemon == "") or (collection == ""):
                     self.log.debug("Instance of an empty key: {}{}".format(daemon, collection))
+                    continue
 
-                if mode == 'separated':
-                    # Add value to result
-                    result[daemon][col_0][col_1]['value'] = \
-                            perf_counters_by_daemon[collection]['value']
-
-                    # Check that 'count' exists, as not all counters have a count field.
-                    if 'count' in perf_counters_by_daemon[collection]:
-                        result[daemon][col_0][col_1]['count'] = \
-                                perf_counters_by_daemon[collection]['count']
-                elif mode == 'aggregated':
-                    # Not every rgw daemon has the same schema. Specifically, each rgw daemon
-                    # has a uniquely-named collection that starts off identically (i.e.
-                    # "objecter-0x...") then diverges (i.e. "...55f4e778e140.op_rmw").
-                    # This bit of code combines these unique counters all under one rgw instance.
-                    # Without this check, the schema would remain separeted out in the final report.
-                    if col_0[0:11] == "objecter-0x":
-                        col_0 = "objecter-0x"
-
-                    # Check that the value can be incremented. In some cases,
-                    # the files are of type 'pair' (real-integer-pair, integer-integer pair).
-                    # In those cases, the value is a dictionary, and not a number.
-                    #   i.e. throttle-msgr_dispatch_throttler-hbserver["wait"]
-                    if isinstance(perf_counters_by_daemon[collection]['value'], numbers.Number):
-                        result[daemon_type][col_0][col_1]['value'] += \
-                                perf_counters_by_daemon[collection]['value']
-
-                    # Check that 'count' exists, as not all counters have a count field.
-                    if 'count' in perf_counters_by_daemon[collection]:
-                        result[daemon_type][col_0][col_1]['count'] += \
-                                perf_counters_by_daemon[collection]['count']
-                else:
-                    self.log.error('Incorrect mode specified in gather_perf_counters: {}'.format(mode))
-                    return {}
+                result[daemon][collection] = []
+
+                for sub_collection in sub_collection_list:
+                    sub_collection_result: Dict[str, dict] = defaultdict(lambda: defaultdict(dict))
+                    sub_collection_result['labels'] = sub_collection['labels']
+                    for sub_collection_counter_name, sub_collection_counter_info in sub_collection['counters'].items():
+                        if mode == 'separated':
+                            # Add value to result
+                            sub_collection_result['counters'][sub_collection_counter_name]['value'] = \
+                                sub_collection_counter_info['value']
+
+                            # Check that 'count' exists, as not all counters have a count field.
+                            if 'count' in sub_collection_counter_info:
+                                sub_collection_result['counters'][sub_collection_counter_name]['count'] = \
+                                        sub_collection_counter_info['count']
+                        elif mode == 'aggregated':
+                            self.log.debug("telemetry in mode: agregated")
+                            # Not every rgw daemon has the same schema. Specifically, each rgw daemon
+                            # has a uniquely-named collection that starts off identically (i.e.
+                            # "objecter-0x...") then diverges (i.e. "...55f4e778e140.op_rmw").
+                            # This bit of code combines these unique counters all under one rgw instance.
+                            # Without this check, the schema would remain separeted out in the final report.
+                            if collection[0:11] == "objecter-0x":
+                                collection = "objecter-0x"
+
+                            # Check that the value can be incremented. In some cases,
+                            # the files are of type 'pair' (real-integer-pair, integer-integer pair).
+                            # In those cases, the value is a dictionary, and not a number.
+                            #   i.e. throttle-msgr_dispatch_throttler-hbserver["wait"]
+                            if isinstance(sub_collection_counter_info['value'], numbers.Number):
+                                sub_collection_result['counters'][sub_collection_counter_name]['value'] += \
+                                        sub_collection_counter_info['value']
+
+                            # Check that 'count' exists, as not all counters have a count field.
+                            if 'count' in sub_collection_counter_info:
+                                sub_collection_result['counters'][sub_collection_counter_name]['count'] += \
+                                        sub_collection_counter_info['count']
+                        else:
+                            self.log.error('Incorrect mode specified in gather_perf_counters: {}'.format(mode))
+                            return {}
+
+                    result[daemon][collection].append(sub_collection_result)
 
         if mode == 'separated':
             # for debugging purposes only, this data is never reported