]> git-server-git.apps.pok.os.sepia.ceph.com Git - cephmetrics.git/commitdiff
mon: updated for logging and additional metrics collected for rbd and osd hosts
authorPaul Cuzner <pcuzner@redhat.com>
Mon, 26 Jun 2017 05:12:23 +0000 (17:12 +1200)
committerPaul Cuzner <pcuzner@redhat.com>
Mon, 26 Jun 2017 05:12:23 +0000 (17:12 +1200)
collector class now scans the cluster to determine the count of RBDs. Each monitor
will pick a discrete set of pools to scan, so the overall load is shared across monitors.

In addition, since the osd tree command is used to determine the up/down state of
the OSDs, the same output is used to determine the number of osd hosts in the
configuration. Prior to this change the determination was inferred through a
graphite query.

collectors/mon.py

index 778210e7b3552fe488e19f39979bf8af7f73c256..0593ff36fab596f7ef29b277b79912c1c87cde1b 100644 (file)
@@ -1,10 +1,31 @@
 #!/usr/bin/env python
 
 import rados
+import rbd
 import json
+import threading
+import time
 
 from collectors.base import BaseCollector
-from collectors.common import add_dicts, merge_dicts
+from collectors.common import add_dicts, merge_dicts, get_hostname
+
+class RBDScanner(threading.Thread):
+
+    def __init__(self, cluster_name, pool_name):
+        self.cluster_name = cluster_name
+        self.pool_name = pool_name
+        self.num_rbds = 0
+        threading.Thread.__init__(self)
+
+    def run(self):
+        rbd_images = []
+        conf_file = "/etc/ceph/{}.conf".format(self.cluster_name)
+        with rados.Rados(conffile=conf_file) as cluster:
+            with cluster.open_ioctx(self.pool_name) as ioctx:
+                rbd_inst = rbd.RBD()
+                rbd_images = rbd_inst.list(ioctx)
+
+        self.num_rbds = len(rbd_images)
 
 
 class Mon(BaseCollector):
@@ -26,6 +47,8 @@ class Mon(BaseCollector):
     cluster_metrics = {
         "num_mon": ("num_mon", "gauge"),
         "num_mon_quorum": ("num_mon_quorum", "gauge"),
+        "num_rbds": ("num_rbds", "gauge"),
+        "num_osd_hosts": ("num_osd_hosts", "gauge"),
         "num_osd": ("num_osd", "gauge"),
         "num_osd_up": ("num_osd_up", "gauge"),
         "num_osd_in": ("num_osd_in", "gauge"),
@@ -72,9 +95,14 @@ class Mon(BaseCollector):
         "status": ("status", "gauge")
     }
 
+    mon_states = {
+        "mon_status": ("mon_status", "gauge")
+    }
+
     all_metrics = merge_dicts(pool_recovery_metrics, pool_client_metrics)
     all_metrics = merge_dicts(all_metrics, cluster_metrics)
     all_metrics = merge_dicts(all_metrics, osd_metrics)
+    all_metrics = merge_dicts(all_metrics, mon_states)
 
     def _mon_command(self, cmd_request):
         """ Issue a command to the monitor """
@@ -82,17 +110,24 @@ class Mon(BaseCollector):
         buf_s = '{}'
         conf_file = "/etc/ceph/{}.conf".format(self.cluster_name)
 
+        start = time.time()
         with rados.Rados(conffile=conf_file) as cluster:
             cmd = {'prefix': cmd_request, 'format': 'json'}
             rc, buf_s, out = cluster.mon_command(json.dumps(cmd), b'')
+        end = time.time()
+
+        self.elapsed_log_msg("_mon_command call for {}".format(cmd_request),
+                             (end - start))
 
         return json.loads(buf_s)
 
     def _mon_health(self):
 
         cluster_data = self._admin_socket().get('cluster')
-        health_text = self._mon_command("health").get('overall_status',
-                                                      'UNKNOWN')
+        health_data = self._mon_command("health")
+        health_text = health_data.get('overall_status',
+                                      'UNKNOWN')
+
         health_num = Mon.health.get(health_text, 16)
 
         cluster = {Mon.cluster_metrics[k][0]: cluster_data[k]
@@ -100,6 +135,15 @@ class Mon(BaseCollector):
 
         cluster['health'] = health_num
 
+        services = health_data.get('health').get('health_services')
+        monstats = {}
+        for svc in services:
+            if 'mons' in svc:
+                monstats = { mon.get('name'): Mon.health.get(mon.get('health'))
+                             for mon in svc.get('mons')}
+
+        cluster['mon_status'] = monstats
+
         return cluster
 
     @classmethod
@@ -152,16 +196,87 @@ class Mon(BaseCollector):
 
         return pool_stats
 
-
     def _get_osd_states(self):
 
         raw = self._mon_command('osd tree')
-        osds = {str(osd.get('id')): {"status": Mon.osd_state.get(osd.get('status'))}
+        osds = {str(osd.get('id')): {"status":
+                Mon.osd_state.get(osd.get('status'))}
                 for osd in raw.get('nodes')
                 if osd.get('type') == 'osd'}
 
-        return osds
+        num_osd_hosts = len([node.get('name') for node in raw.get('nodes')
+                             if node.get('type') == 'host'])
+
+        return num_osd_hosts, osds
+
+    @staticmethod
+    def _select_pools(pools, mons):
+        """
+        determine the pools this mon should scan based on it's name. We select
+        pools from the an offset into the pool list, and then repeat at an
+        interval set by # mons in the configuration. This splits up the pools
+        we have, so each mon looks at a discrete set of pools instead of all
+        mons performing all scans.
+        :param pools: (list) rados pool names
+        :param mons: (list) monitor names from ceph health
+        :return: (list) of pools this monitor should scan. empty list if the
+                 monitor name mismatches - so no scans done
+        """
+
+        pools_to_scan = []
+
+        try:
+            freq = mons.index(get_hostname())
+        except ValueError:
+            # this host's name is not in the monitor list?
+            # twilight zone moment
+            pass
+        else:
 
+            pools_to_scan = [pools[ptr]
+                             for ptr in xrange(freq, len(pools), len(mons))]
+
+        return pools_to_scan
+
+    def get_pools(self):
+
+        start = time.time()
+        conf_file = "/etc/ceph/{}.conf".format(self.cluster_name)
+        with rados.Rados(conffile=conf_file) as cluster:
+            rados_pools = sorted(cluster.list_pools())
+        end = time.time()
+
+        self.logger.debug('lspools took {0:.2f} secs'.format(end - start))
+
+        return rados_pools
+
+    def _get_rbds(self, monitors):
+
+        pool_list = self.get_pools()
+        mon_list = sorted(monitors.keys())
+        my_pools = Mon._select_pools(pool_list, mon_list)
+        threads = []
+
+        start = time.time()
+
+        for pool in my_pools:
+            thread = RBDScanner(self.cluster_name, pool)
+            thread.start()
+            threads.append(thread)
+
+        # wait for all threads
+        for thread in threads:
+            thread.join()
+
+        end = time.time()
+        self.elapsed_log_msg("rbd scans", (end - start))
+
+        total_rbds = sum([thread.num_rbds for thread in threads])
+
+        for thread in threads:
+            del thread
+
+        return total_rbds
 
     def get_stats(self):
         """
@@ -169,11 +284,19 @@ class Mon(BaseCollector):
         :return:
         """
 
+        start = time.time()
+
         pool_stats = self._get_pool_stats()
-        osd_states = self._get_osd_states()
+        num_osd_hosts, osd_states = self._get_osd_states()
         cluster_state = self._mon_health()
+        cluster_state['num_osd_hosts'] = num_osd_hosts
+        cluster_state['num_rbds'] = self._get_rbds(cluster_state['mon_status'])
+
         all_stats = merge_dicts(cluster_state, {"pools": pool_stats,
                                                 "osd_state": osd_states})
 
+        end = time.time()
+        self.elapsed_log_msg("mon get_stats call", (end - start))
+
         return {"mon": all_stats}