]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/prometheus: provide RBD stats via osd dynamic perf counters 25358/head
author Mykola Golub <mgolub@suse.com>
Fri, 30 Nov 2018 14:52:17 +0000 (16:52 +0200)
committer Mykola Golub <mgolub@suse.com>
Mon, 10 Dec 2018 11:56:22 +0000 (11:56 +0000)
Signed-off-by: Mykola Golub <mgolub@suse.com>
src/pybind/mgr/prometheus/module.py

index 4806202ac302baf189fe16582eeb9ccf53c5fbf9..cb74a26682815b91aa69e21db4a71136fb68d233 100644 (file)
@@ -7,6 +7,7 @@ import socket
 import threading
 import time
 from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
+from rbd import RBD
 
 # Defaults for the Prometheus HTTP server.  Can also set in config-key
 # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
@@ -157,6 +158,8 @@ class Module(MgrModule):
             {'name': 'server_addr'},
             {'name': 'server_port'},
             {'name': 'scrape_interval'},
+            {'name': 'rbd_stats_pools'},
+            {'name': 'rbd_stats_pools_refresh_interval'},
     ]
 
     def __init__(self, *args, **kwargs):
@@ -167,6 +170,24 @@ class Module(MgrModule):
         self.collect_time = 0
         self.collect_timeout = 5.0
         self.collect_cache = None
+        self.rbd_stats = {
+            'pools' : {},
+            'pools_refresh_time' : 0,
+            'counters_info' : {
+                'write_ops' : {'type' : self.PERFCOUNTER_COUNTER,
+                               'desc' : 'RBD image writes count'},
+                'read_ops' : {'type' : self.PERFCOUNTER_COUNTER,
+                              'desc' : 'RBD image reads count'},
+                'write_bytes' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+                                 'desc' : 'RBD image bytes written'},
+                'read_bytes' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+                                'desc' : 'RBD image bytes read'},
+                'write_latency' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+                                   'desc' : 'RBD image writes latency (msec)'},
+                'read_latency' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+                                  'desc' : 'RBD image reads latency (msec)'},
+            },
+        }
         _global_instance['plugin'] = self
 
     def _setup_static_metrics(self):
@@ -514,6 +535,173 @@ class Module(MgrModule):
             stat = 'num_objects_{}'.format(obj)
             self.metrics[stat].set(pg_sum[stat])
 
+    def get_rbd_stats(self):
+        """Export per RBD image I/O metrics for the configured pools.
+
+        The pools to monitor are read from the 'rbd_stats_pools' option
+        (a whitespace separated list of pool names).  The cached pool and
+        image listing is refreshed when that list differs from the cached
+        pools, when 'rbd_stats_pools_refresh_interval' seconds have passed
+        since the last refresh, or when a counter arrives for a pool or
+        image that is not cached yet.  Values returned by the OSD perf
+        query are added to the per image totals, which are then set on
+        ``self.metrics`` with "pool" and "image" labels.
+
+        Returns None.  Nothing is exported when no pool is being tracked
+        or when the OSD perf query cannot be registered.
+        """
+        # Per RBD image stats are collected by registering a dynamic osd perf
+        # stats query that tells OSDs to group stats for requests associated
+        # with RBD objects by pool and image id, which are extracted from the
+        # request object names or other attributes.
+        # The RBD object names have the following prefixes:
+        #   - rbd_data.{image_id}. (data stored in the same pool as metadata)
+        #   - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
+        #   - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
+        # The pool_id in the object name is the id of the pool with the image
+        # metadata, and should be used in the image spec. If there is no pool_id
+        # in the object name, the image pool is the pool where the object is
+        # located.
+
+        pools = self.get_localized_config('rbd_stats_pools', '').split()
+        pools.sort()
+
+        # Drop cached pools that are no longer configured and remember the
+        # names of the configured ones that are already cached.
+        rbd_stats_pools = []
+        for pool_id in list(self.rbd_stats['pools']):
+            name = self.rbd_stats['pools'][pool_id]['name']
+            if name not in pools:
+                del self.rbd_stats['pools'][pool_id]
+            else:
+                rbd_stats_pools.append(name)
+
+        pools_refreshed = False
+        if pools:
+            # NOTE(review): an explicitly configured value presumably comes
+            # back as a string -- confirm against get_localized_config.
+            # Coerce it so the addition below cannot raise TypeError.
+            refresh_interval = float(self.get_localized_config(
+                'rbd_stats_pools_refresh_interval', 300))
+            next_refresh = (self.rbd_stats['pools_refresh_time'] +
+                            refresh_interval)
+            rbd_stats_pools.sort()
+            if rbd_stats_pools != pools or time.time() >= next_refresh:
+                self.refresh_rbd_stats_pools(pools)
+                pools_refreshed = True
+
+        pool_ids = sorted(self.rbd_stats['pools'])
+        pool_id_regex = '|'.join('^%s$' % x for x in pool_ids)
+
+        # The registered query is only valid for the pool set it was built
+        # for; remove it when the pool id filter has changed.
+        if 'query' in self.rbd_stats and \
+           pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex']:
+            self.remove_osd_perf_query(self.rbd_stats['query_id'])
+            del self.rbd_stats['query_id']
+            del self.rbd_stats['query']
+
+        if not self.rbd_stats['pools']:
+            return
+
+        counters_info = self.rbd_stats['counters_info']
+
+        if 'query_id' not in self.rbd_stats:
+            query = {
+                'key_descriptor': [
+                    {'type': 'pool_id', 'regex': pool_id_regex},
+                    # Raw string: the pattern contains '\.' escapes.
+                    {'type': 'object_name',
+                     'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
+                ],
+                'performance_counter_descriptors': list(counters_info),
+            }
+            query_id = self.add_osd_perf_query(query)
+            if query_id is None:
+                self.log.error('failed to add query %s' % query)
+                return
+            self.rbd_stats['query'] = query
+            self.rbd_stats['query_id'] = query_id
+
+        res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
+        for c in res['counters']:
+            # if the pool id is not found in the object name use id of the
+            # pool where the object is located
+            if c['k'][1][1]:
+                pool_id = int(c['k'][1][1])
+            else:
+                pool_id = int(c['k'][0][0])
+            # An unknown pool or image triggers at most one extra refresh
+            # per call; if it is still unknown afterwards it is skipped.
+            if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
+                self.refresh_rbd_stats_pools(pools)
+                pools_refreshed = True
+            if pool_id not in self.rbd_stats['pools']:
+                continue
+            image_id = c['k'][1][2]
+            pool = self.rbd_stats['pools'][pool_id]
+            if image_id not in pool['images'] and not pools_refreshed:
+                self.refresh_rbd_stats_pools(pools)
+                pools_refreshed = True
+            if image_id not in pool['images']:
+                continue
+            counters = pool['images'][image_id]['c']
+            for i, delta in enumerate(c['c']):
+                counters[i][0] += delta[0]
+                counters[i][1] += delta[1]
+
+        def set_metric(path, stattype, desc, value, labels):
+            # Create the labelled metric on first use, then record the value.
+            if path not in self.metrics:
+                self.metrics[path] = Metric(
+                    stattype,
+                    path,
+                    desc,
+                    ("pool", "image",),
+                )
+            self.metrics[path].set(value, labels)
+
+        for pool in self.rbd_stats['pools'].values():
+            pool_name = pool['name']
+            for image in pool['images'].values():
+                labels = (pool_name, image['n'],)
+                counters = image['c']
+                # Entry i of the per image totals belongs to the i-th key of
+                # counters_info, the order used to build the query above.
+                for i, key in enumerate(counters_info):
+                    counter_info = counters_info[key]
+                    stattype = self._stattype_to_str(counter_info['type'])
+                    if counter_info['type'] == self.PERFCOUNTER_COUNTER:
+                        set_metric('rbd_' + key, stattype,
+                                   counter_info['desc'],
+                                   counters[i][0], labels)
+                    elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
+                        set_metric('rbd_' + key + '_sum', stattype,
+                                   counter_info['desc'] + ' Total',
+                                   counters[i][0], labels)
+                        set_metric('rbd_' + key + '_count', 'counter',
+                                   counter_info['desc'] + ' Count',
+                                   counters[i][1], labels)
+
+    def refresh_rbd_stats_pools(self, pools):
+        """Rebuild the cached image listing for each pool name in *pools*.
+
+        For every pool the image list is re-read; images that were already
+        cached keep their accumulated counters, new images start from
+        zeroed [sum, count] pairs, and images no longer listed are dropped.
+        A failure on one pool is logged and the remaining pools are still
+        processed.  The refresh timestamp is updated unconditionally.
+        """
+        self.log.debug('refreshing rbd pools %s' % (pools))
+
+        cached_pools = self.rbd_stats['pools']
+        num_counters = len(self.rbd_stats['counters_info'])
+        for pool_name in pools:
+            try:
+                pool_id = self.rados.pool_lookup(pool_name)
+                with self.rados.open_ioctx(pool_name) as ioctx:
+                    if pool_id not in cached_pools:
+                        cached_pools[pool_id] = {'images' : {}}
+                    pool = cached_pools[pool_id]
+                    pool['name'] = pool_name
+                    known = pool['images']
+                    fresh = {}
+                    for meta in RBD().list2(ioctx):
+                        entry = {'n' : meta['name']}
+                        image_id = meta['id']
+                        # Carry over totals gathered so far for known images.
+                        if image_id in known:
+                            entry['c'] = known[image_id]['c']
+                        else:
+                            entry['c'] = [[0, 0] for _ in range(num_counters)]
+                        fresh[image_id] = entry
+                    pool['images'] = fresh
+            except Exception as e:
+                self.log.error('failed listing pool %s: %s' % (pool_name, e))
+        self.rbd_stats['pools_refresh_time'] = time.time()
+
+    def shutdown_rbd_stats(self):
+        """Remove the registered OSD perf query, if any, and drop cached pools."""
+        stats = self.rbd_stats
+        if 'query_id' in stats:
+            # Remove the query first; the bookkeeping entries are only
+            # discarded once that call has returned.
+            self.remove_osd_perf_query(stats['query_id'])
+            stats.pop('query_id')
+            stats.pop('query')
+        stats['pools'].clear()
+
     def collect(self):
         # Clear the metrics before scraping
         for k in self.metrics.keys():
@@ -570,6 +758,8 @@ class Module(MgrModule):
                         )
                     self.metrics[path].set(value, (daemon,))
 
+        self.get_rbd_stats();
+
         # Return formatted metrics and clear no longer used data
         _metrics = [m.str_expfmt() for m in self.metrics.values()]
         for k in self.metrics.keys():
@@ -700,6 +890,7 @@ class Module(MgrModule):
         self.shutdown_event.clear()
         cherrypy.engine.stop()
         self.log.info('Engine stopped.')
+        self.shutdown_rbd_stats()
 
     def shutdown(self):
         self.log.info('Stopping engine...')