From: Mykola Golub Date: Fri, 30 Nov 2018 14:52:17 +0000 (+0200) Subject: mgr/prometheus: provide RBD stats via osd dynamic perf counters X-Git-Tag: v14.1.0~649^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=ad174de0808981d733508b6dc2d528eb6cd96519;p=ceph-ci.git mgr/prometheus: provide RBD stats via osd dynamic perf counters Signed-off-by: Mykola Golub --- diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 4806202ac30..cb74a266828 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -7,6 +7,7 @@ import socket import threading import time from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES +from rbd import RBD # Defaults for the Prometheus HTTP server. Can also set in config-key # see https://github.com/prometheus/prometheus/wiki/Default-port-allocations @@ -157,6 +158,8 @@ class Module(MgrModule): {'name': 'server_addr'}, {'name': 'server_port'}, {'name': 'scrape_interval'}, + {'name': 'rbd_stats_pools'}, + {'name': 'rbd_stats_pools_refresh_interval'}, ] def __init__(self, *args, **kwargs): @@ -167,6 +170,24 @@ class Module(MgrModule): self.collect_time = 0 self.collect_timeout = 5.0 self.collect_cache = None + self.rbd_stats = { + 'pools' : {}, + 'pools_refresh_time' : 0, + 'counters_info' : { + 'write_ops' : {'type' : self.PERFCOUNTER_COUNTER, + 'desc' : 'RBD image writes count'}, + 'read_ops' : {'type' : self.PERFCOUNTER_COUNTER, + 'desc' : 'RBD image reads count'}, + 'write_bytes' : {'type' : self.PERFCOUNTER_LONGRUNAVG, + 'desc' : 'RBD image bytes written'}, + 'read_bytes' : {'type' : self.PERFCOUNTER_LONGRUNAVG, + 'desc' : 'RBD image bytes read'}, + 'write_latency' : {'type' : self.PERFCOUNTER_LONGRUNAVG, + 'desc' : 'RBD image writes latency (msec)'}, + 'read_latency' : {'type' : self.PERFCOUNTER_LONGRUNAVG, + 'desc' : 'RBD image reads latency (msec)'}, + }, + } _global_instance['plugin'] = self def _setup_static_metrics(self): @@ 
# Patch hunk "@@ -514,6 +535,173 @@ class Module(MgrModule):" of
# src/pybind/mgr/prometheus/module.py, restored from the whitespace-collapsed
# diff into formatted Python.  The three functions below are the methods this
# hunk adds to ``Module``; they are inserted after the unchanged context
#     stat = 'num_objects_{}'.format(obj)
#     self.metrics[stat].set(pg_sum[stat])
# and before the unchanged ``def collect(self):`` method.


def get_rbd_stats(self):
    """Collect per-image RBD I/O counters and publish them as metrics.

    Reads the ``rbd_stats_pools`` option (whitespace separated pool names),
    keeps ``self.rbd_stats['pools']`` in sync with it, (re)registers the
    dynamic OSD perf query when the set of pool ids changes, adds the
    counters returned by the query to the per-image accumulators and
    finally sets the ``rbd_*`` entries of ``self.metrics`` labelled with
    ("pool", "image").

    Returns None.  It returns early, without touching ``self.metrics``,
    when no pool is being tracked or when the perf query cannot be added.
    """
    # Per RBD image stats is collected by registering a dynamic osd perf
    # stats query that tells OSDs to group stats for requests associated
    # with RBD objects by pool and image id, which are extracted from the
    # request object names or other attributes.
    # The RBD object names have the following prefixes:
    #   - rbd_data.{image_id}. (data stored in the same pool as metadata)
    #   - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data pool)
    #   - journal_data.{pool_id}.{image_id}. (journal if journaling is enabled)
    # The pool_id in the object name is the id of the pool with the image
    # metadata, and should be used in the image spec. If there is no pool_id
    # in the object name, the image pool is the pool where the object is
    # located.

    # De-duplicate the configured names: a repeated name would otherwise
    # never compare equal to the tracked names and force a refresh on
    # every scrape.
    pools = sorted(set(
        self.get_localized_config('rbd_stats_pools', '').split()))

    # Drop pools that are no longer configured and note which configured
    # pools are already tracked.
    tracked_names = set()
    for pool_id in list(self.rbd_stats['pools']):
        name = self.rbd_stats['pools'][pool_id]['name']
        if name not in pools:
            del self.rbd_stats['pools'][pool_id]
        else:
            tracked_names.add(name)

    pools_refreshed = False
    if pools:
        # float() keeps the addition valid whether the option comes back
        # as a number (the default) or as a string.
        refresh_interval = float(self.get_localized_config(
            'rbd_stats_pools_refresh_interval', 300))
        next_refresh = self.rbd_stats['pools_refresh_time'] + refresh_interval
        if sorted(tracked_names) != pools or time.time() >= next_refresh:
            self.refresh_rbd_stats_pools(pools)
            pools_refreshed = True

    pool_ids = sorted(self.rbd_stats['pools'])
    pool_id_regex = '|'.join(['^%s$' % x for x in pool_ids])

    # The registered query is keyed on the pool id regex; when the set of
    # pools changed the old query has to be replaced.
    if 'query' in self.rbd_stats and \
       pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex']:
        self.remove_osd_perf_query(self.rbd_stats['query_id'])
        del self.rbd_stats['query_id']
        del self.rbd_stats['query']

    if not self.rbd_stats['pools']:
        return

    counters_info = self.rbd_stats['counters_info']

    if 'query_id' not in self.rbd_stats:
        query = {
            'key_descriptor': [
                {'type': 'pool_id', 'regex': pool_id_regex},
                {'type': 'object_name',
                 'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
            ],
            'performance_counter_descriptors': list(counters_info),
        }
        query_id = self.add_osd_perf_query(query)
        if query_id is None:
            self.log.error('failed to add query %s' % query)
            return
        self.rbd_stats['query'] = query
        self.rbd_stats['query_id'] = query_id

    res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
    for c in res['counters']:
        # if the pool id is not found in the object name use id of the
        # pool where the object is located
        if c['k'][1][1]:
            pool_id = int(c['k'][1][1])
        else:
            pool_id = int(c['k'][0][0])
        # An unknown pool or image may simply be newer than the cached
        # listing, so refresh it (at most once per call) before skipping.
        if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
            self.refresh_rbd_stats_pools(pools)
            pools_refreshed = True
        if pool_id not in self.rbd_stats['pools']:
            continue
        image_id = c['k'][1][2]
        pool = self.rbd_stats['pools'][pool_id]
        if image_id not in pool['images'] and not pools_refreshed:
            self.refresh_rbd_stats_pools(pools)
            pools_refreshed = True
        if image_id not in pool['images']:
            continue
        counters = pool['images'][image_id]['c']
        for counter, delta in zip(counters, c['c']):
            counter[0] += delta[0]
            counter[1] += delta[1]

    def _publish(path, metric_type, desc, value, labels):
        # Create the metric on first use, then record the labelled value.
        # ``Metric`` is defined elsewhere in module.py (not in this hunk).
        if path not in self.metrics:
            self.metrics[path] = Metric(
                metric_type,
                path,
                desc,
                ("pool", "image",),
            )
        self.metrics[path].set(value, labels)

    for pool in self.rbd_stats['pools'].values():
        pool_name = pool['name']
        for image in pool['images'].values():
            labels = (pool_name, image['n'],)
            counters = image['c']
            # counters[i] belongs to the i-th key of counters_info, the
            # same order that was passed to the query as
            # 'performance_counter_descriptors'.
            for i, key in enumerate(counters_info):
                counter_info = counters_info[key]
                stattype = self._stattype_to_str(counter_info['type'])
                if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                    _publish('rbd_' + key, stattype,
                             counter_info['desc'], counters[i][0], labels)
                elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                    # Long-running averages are exported as a sum/count
                    # pair.
                    _publish('rbd_' + key + '_sum', stattype,
                             counter_info['desc'] + ' Total',
                             counters[i][0], labels)
                    _publish('rbd_' + key + '_count', 'counter',
                             counter_info['desc'] + ' Count',
                             counters[i][1], labels)


def refresh_rbd_stats_pools(self, pools):
    """Re-list the RBD images of every pool named in *pools*.

    For each pool the id is looked up and the images are listed with
    ``RBD().list2()``; ``self.rbd_stats['pools'][pool_id]`` is updated with
    the pool name and a fresh ``images`` dict.  Accumulated counters of
    images that still exist are carried over, new images start at
    ``[0, 0]`` per counter.  A failure for one pool is logged and does not
    stop the remaining pools.  ``pools_refresh_time`` is always updated.
    """
    self.log.debug('refreshing rbd pools %s' % (pools,))

    counters_info = self.rbd_stats['counters_info']
    for pool_name in pools:
        try:
            pool_id = self.rados.pool_lookup(pool_name)
            with self.rados.open_ioctx(pool_name) as ioctx:
                if pool_id not in self.rbd_stats['pools']:
                    self.rbd_stats['pools'][pool_id] = {'images': {}}
                pool = self.rbd_stats['pools'][pool_id]
                pool['name'] = pool_name
                images = {}
                for image_meta in RBD().list2(ioctx):
                    image = {'n': image_meta['name']}
                    image_id = image_meta['id']
                    if image_id in pool['images']:
                        image['c'] = pool['images'][image_id]['c']
                    else:
                        image['c'] = [[0, 0] for _ in counters_info]
                    images[image_id] = image
                pool['images'] = images
        except Exception as e:
            # Deliberately broad: this is best effort per pool, and one
            # unreadable pool must not break the whole refresh.
            self.log.error('failed listing pool %s: %s' % (pool_name, e))
    self.rbd_stats['pools_refresh_time'] = time.time()


def shutdown_rbd_stats(self):
    """Remove the registered OSD perf query, if any, and forget all pools."""
    if 'query_id' in self.rbd_stats:
        self.remove_osd_perf_query(self.rbd_stats['query_id'])
        del self.rbd_stats['query_id']
        del self.rbd_stats['query']
    self.rbd_stats['pools'].clear()


# Remaining hunks of the same patch that touched this part of the diff:
#   "@@ -570,6 +758,8 @@ class Module(MgrModule):" adds a call
#       self.get_rbd_stats()
#   after ``self.metrics[path].set(value, (daemon,))`` and right before the
#   "# Return formatted metrics and clear no longer used data" step.
#   "@@ -700,6 +890,7 @@ class Module(MgrModule):" follows below.
Module(MgrModule): self.shutdown_event.clear() cherrypy.engine.stop() self.log.info('Engine stopped.') + self.shutdown_rbd_stats() def shutdown(self): self.log.info('Stopping engine...')