import threading
import time
from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
+from rbd import RBD
# Defaults for the Prometheus HTTP server. Can also be set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
{'name': 'server_addr'},
{'name': 'server_port'},
{'name': 'scrape_interval'},
+ {'name': 'rbd_stats_pools'},
+ {'name': 'rbd_stats_pools_refresh_interval'},
]
def __init__(self, *args, **kwargs):
self.collect_time = 0
self.collect_timeout = 5.0
self.collect_cache = None
+ self.rbd_stats = {
+ 'pools' : {},
+ 'pools_refresh_time' : 0,
+ 'counters_info' : {
+ 'write_ops' : {'type' : self.PERFCOUNTER_COUNTER,
+ 'desc' : 'RBD image writes count'},
+ 'read_ops' : {'type' : self.PERFCOUNTER_COUNTER,
+ 'desc' : 'RBD image reads count'},
+ 'write_bytes' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+ 'desc' : 'RBD image bytes written'},
+ 'read_bytes' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+ 'desc' : 'RBD image bytes read'},
+ 'write_latency' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+ 'desc' : 'RBD image writes latency (msec)'},
+ 'read_latency' : {'type' : self.PERFCOUNTER_LONGRUNAVG,
+ 'desc' : 'RBD image reads latency (msec)'},
+ },
+ }
_global_instance['plugin'] = self
def _setup_static_metrics(self):
stat = 'num_objects_{}'.format(obj)
self.metrics[stat].set(pg_sum[stat])
def get_rbd_stats(self):
    """Gather per-image RBD I/O counters from the OSDs and export them.

    Reads the ``rbd_stats_pools`` setting (whitespace-separated pool
    names), keeps ``self.rbd_stats['pools']`` in sync with it,
    (re)registers the OSD perf query whenever the set of tracked pool ids
    changes, adds the values reported by ``get_osd_perf_counters`` to the
    per-image totals and finally publishes those totals in
    ``self.metrics``, labelled with pool and image name:

    - ``rbd_<counter>`` for ``PERFCOUNTER_COUNTER`` entries;
    - ``rbd_<counter>_sum`` and ``rbd_<counter>_count`` for
      ``PERFCOUNTER_LONGRUNAVG`` entries.

    Returns None. Returns early, publishing nothing, when no pools are
    tracked or when the perf query cannot be registered.
    """
    # Per RBD image stats is collected by registering a dynamic osd perf
    # stats query that tells OSDs to group stats for requests associated
    # with RBD objects by pool and image id, which are extracted from the
    # request object names or other attributes.
    # The RBD object names have the following prefixes:
    #   - rbd_data.{image_id}. (data stored in the same pool as metadata)
    #   - rbd_data.{pool_id}.{image_id}. (data stored in a dedicated data
    #     pool)
    #   - journal_data.{pool_id}.{image_id}. (journal if journaling is
    #     enabled)
    # The pool_id in the object name is the id of the pool with the image
    # metadata, and should be used in the image spec. If there is no
    # pool_id in the object name, the image pool is the pool where the
    # object is located.

    pools = sorted(self.get_localized_config('rbd_stats_pools', '').split())

    # Forget pools that are no longer configured; remember the names of
    # the ones we keep so a configured-but-untracked pool can be detected.
    tracked_names = []
    for pool_id in list(self.rbd_stats['pools']):
        name = self.rbd_stats['pools'][pool_id]['name']
        if name in pools:
            tracked_names.append(name)
        else:
            del self.rbd_stats['pools'][pool_id]

    pools_refreshed = False
    if pools:
        default_interval = 300
        raw_interval = self.get_localized_config(
            'rbd_stats_pools_refresh_interval', default_interval)
        # NOTE(review): a value set by the operator presumably comes back
        # as a string; coerce it so the addition below cannot raise
        # TypeError and abort the whole scrape.
        try:
            refresh_interval = float(raw_interval)
        except (TypeError, ValueError):
            self.log.error('invalid rbd_stats_pools_refresh_interval %r, '
                           'using %s' % (raw_interval, default_interval))
            refresh_interval = default_interval
        next_refresh = self.rbd_stats['pools_refresh_time'] + refresh_interval
        tracked_names.sort()
        if tracked_names != pools or time.time() >= next_refresh:
            self.refresh_rbd_stats_pools(pools)
            pools_refreshed = True

    pool_id_regex = '|'.join(
        '^%s$' % pool_id for pool_id in sorted(self.rbd_stats['pools']))

    # The registered query is only valid for the pool ids it was created
    # with; drop it when the tracked set changed so it is re-added below.
    if 'query' in self.rbd_stats and \
       pool_id_regex != self.rbd_stats['query']['key_descriptor'][0]['regex']:
        self.remove_osd_perf_query(self.rbd_stats['query_id'])
        del self.rbd_stats['query_id']
        del self.rbd_stats['query']

    if not self.rbd_stats['pools']:
        return

    counters_info = self.rbd_stats['counters_info']

    if 'query_id' not in self.rbd_stats:
        query = {
            'key_descriptor': [
                {'type': 'pool_id', 'regex': pool_id_regex},
                {'type': 'object_name',
                 'regex': r'^(?:rbd|journal)_data\.(?:([0-9]+)\.)?([^.]+)\.'},
            ],
            'performance_counter_descriptors': list(counters_info),
        }
        query_id = self.add_osd_perf_query(query)
        if query_id is None:
            self.log.error('failed to add query %s' % query)
            return
        self.rbd_stats['query'] = query
        self.rbd_stats['query_id'] = query_id

    res = self.get_osd_perf_counters(self.rbd_stats['query_id'])
    for c in res['counters']:
        # if the pool id is not found in the object name use id of the
        # pool where the object is located
        if c['k'][1][1]:
            pool_id = int(c['k'][1][1])
        else:
            pool_id = int(c['k'][0][0])
        # An unknown pool or image may simply be newer than our cached
        # listing: refresh once per call before giving up on the entry.
        if pool_id not in self.rbd_stats['pools'] and not pools_refreshed:
            self.refresh_rbd_stats_pools(pools)
            pools_refreshed = True
        if pool_id not in self.rbd_stats['pools']:
            continue
        image_id = c['k'][1][2]
        pool = self.rbd_stats['pools'][pool_id]
        if image_id not in pool['images'] and not pools_refreshed:
            self.refresh_rbd_stats_pools(pools)
            pools_refreshed = True
        if image_id not in pool['images']:
            continue
        # Reported values are added to the stored per-image totals.
        totals = pool['images'][image_id]['c']
        for total, reported in zip(totals, c['c']):
            total[0] += reported[0]
            total[1] += reported[1]

    def set_metric(path, stattype, desc, value, labels):
        # Create the metric on first use, then record the labelled value.
        if path not in self.metrics:
            self.metrics[path] = Metric(
                stattype,
                path,
                desc,
                ("pool", "image",),
            )
        self.metrics[path].set(value, labels)

    for pool in self.rbd_stats['pools'].values():
        for image in pool['images'].values():
            labels = (pool['name'], image['n'],)
            counters = image['c']
            # counters[i] belongs to the i-th key of counters_info.
            for i, key in enumerate(counters_info):
                counter_info = counters_info[key]
                stattype = self._stattype_to_str(counter_info['type'])
                if counter_info['type'] == self.PERFCOUNTER_COUNTER:
                    set_metric('rbd_' + key, stattype,
                               counter_info['desc'],
                               counters[i][0], labels)
                elif counter_info['type'] == self.PERFCOUNTER_LONGRUNAVG:
                    set_metric('rbd_' + key + '_sum', stattype,
                               counter_info['desc'] + ' Total',
                               counters[i][0], labels)
                    set_metric('rbd_' + key + '_count', 'counter',
                               counter_info['desc'] + ' Count',
                               counters[i][1], labels)
+
def refresh_rbd_stats_pools(self, pools):
    """Re-list the images of every pool in *pools* and update the cache.

    For each pool name the pool id is looked up and the pool's images are
    listed. Images that were already cached keep their accumulated
    counters; newly seen images start with one zeroed ``[0, 0]`` pair per
    entry of ``counters_info``; images that are no longer listed drop out
    because the pool's image map is replaced wholesale.

    A failure while handling one pool is logged and does not prevent the
    remaining pools from being refreshed. ``pools_refresh_time`` is
    stamped with the current time at the end in every case.
    """
    self.log.debug('refreshing rbd pools %s' % (pools))

    known_pools = self.rbd_stats['pools']
    num_counters = len(self.rbd_stats['counters_info'])
    for pool_name in pools:
        try:
            pool_id = self.rados.pool_lookup(pool_name)
            with self.rados.open_ioctx(pool_name) as ioctx:
                entry = known_pools.setdefault(pool_id, {'images': {}})
                entry['name'] = pool_name
                previous = entry['images']
                current = {}
                for meta in RBD().list2(ioctx):
                    record = {'n': meta['name']}
                    image_id = meta['id']
                    if image_id in previous:
                        # carry the running totals over to the new map
                        record['c'] = previous[image_id]['c']
                    else:
                        record['c'] = [[0, 0] for _ in range(num_counters)]
                    current[image_id] = record
                entry['images'] = current
        except Exception as e:
            self.log.error('failed listing pool %s: %s' % (pool_name, e))
    self.rbd_stats['pools_refresh_time'] = time.time()
+
def shutdown_rbd_stats(self):
    """Unregister the OSD perf query, if any, and empty the pool cache.

    When a query id is recorded, the query is removed via
    ``remove_osd_perf_query`` and both the ``query_id`` and ``query``
    entries are deleted from ``self.rbd_stats``. The cached pools dict is
    cleared in place whether or not a query was registered.
    """
    stats = self.rbd_stats
    if 'query_id' in stats:
        self.remove_osd_perf_query(stats['query_id'])
        for key in ('query_id', 'query'):
            del stats[key]
    stats['pools'].clear()
+
def collect(self):
# Clear the metrics before scraping
for k in self.metrics.keys():
)
self.metrics[path].set(value, (daemon,))
+ self.get_rbd_stats();
+
# Return formatted metrics and clear no longer used data
_metrics = [m.str_expfmt() for m in self.metrics.values()]
for k in self.metrics.keys():
self.shutdown_event.clear()
cherrypy.engine.stop()
self.log.info('Engine stopped.')
+ self.shutdown_rbd_stats()
def shutdown(self):
self.log.info('Stopping engine...')