From e827702b7fd2f0586fc303e003b2bf5f33351575 Mon Sep 17 00:00:00 2001
From: Paul Cuzner <pcuzner@redhat.com>
Date: Wed, 23 Aug 2017 14:56:37 +1200
Subject: [PATCH] mon: account for null dict from _admin_socket

the _admin_socket method could return a null dict if the
socket is not there (i.e. ceph-mon is down). By checking for the
empty dict, the collector can remain active while ceph-mon is
stopped and restarted during normal maintenance processes on a
host.
---
 collectors/mon.py | 109 +++++++++++++++++++++++++++-------------------
 1 file changed, 64 insertions(+), 45 deletions(-)

diff --git a/collectors/mon.py b/collectors/mon.py
index ed36e7c..1b608c1 100644
--- a/collectors/mon.py
+++ b/collectors/mon.py
@@ -156,6 +156,10 @@ class Mon(BaseCollector):
     def __init__(self, *args, **kwargs):
         BaseCollector.__init__(self, *args, **kwargs)
 
+        self.admin_socket = ('/var/run/ceph/{}-mon.'
+                             '{}.asok'.format(self.cluster_name,
+                                              get_hostname()))
+
         self.last_state = CephState()
 
         self.ip_names = get_names()
@@ -253,19 +257,20 @@ class Mon(BaseCollector):
 
         cluster, health_data = self._mon_health_common()
 
-        mon_status_output = self._mon_command('mon_status')
-        quorum_list = mon_status_output.get('quorum')
-        mon_list = mon_status_output.get('monmap').get('mons')
-        mon_status = {}
-        for mon in mon_list:
-            state = 0 if mon.get('rank') in quorum_list else 4
-            mon_status[mon.get('name')] = state
+        if cluster:
+            mon_status_output = self._mon_command('mon_status')
+            quorum_list = mon_status_output.get('quorum')
+            mon_list = mon_status_output.get('monmap').get('mons')
+            mon_status = {}
+            for mon in mon_list:
+                state = 0 if mon.get('rank') in quorum_list else 4
+                mon_status[mon.get('name')] = state
 
-        cluster['mon_status'] = mon_status
+            cluster['mon_status'] = mon_status
 
-        self.manage_event(health_data.get('status'),
-                          health_data.get('summary', []),
-                          mon_status)
+            self.manage_event(health_data.get('status'),
+                              health_data.get('summary', []),
+                              mon_status)
 
         return cluster
 
@@ -275,47 +280,51 @@ class Mon(BaseCollector):
         # ceph.conf "mon_health_preluminous_compat=true"
         # this will provide the same output as pre-luminous
 
-        cluster_data = self._admin_socket().get('cluster')
-        pg_data = self._mon_command("pg stat")
-        health_data = self._mon_command("health")
-        health_text = health_data.get('status',
-                                      health_data.get('overall_status', ''))
+        health_data = {}
+        cluster = {}
+
+        cluster_data = self._admin_socket().get('cluster', {})
+        if cluster_data:
+            pg_data = self._mon_command("pg stat")
+            health_data = self._mon_command("health")
+            health_text = health_data.get('status',
+                                          health_data.get('overall_status', ''))
 
-        cluster = {Mon.cluster_metrics[k][0]: cluster_data[k]
-                   for k in cluster_data}
+            cluster = {Mon.cluster_metrics[k][0]: cluster_data[k]
+                       for k in cluster_data}
 
-        health_num = Mon.health.get(health_text, 16)
+            health_num = Mon.health.get(health_text, 16)
 
-        cluster['health'] = health_num
+            cluster['health'] = health_num
 
-        pg_states = pg_data.get('num_pg_by_state')  # list of dict name,num
-        health_summary = health_data.get('summary', [])  # list of issues
-        cluster['num_pgs_stuck'] = Mon.check_stuck_pgs(health_summary)
-        cluster['features'] = Mon.get_feature_state(health_summary,
-                                                    pg_states)
+            pg_states = pg_data.get('num_pg_by_state')  # list of dict name,num
+            health_summary = health_data.get('summary', [])  # list of issues
+            cluster['num_pgs_stuck'] = Mon.check_stuck_pgs(health_summary)
+            cluster['features'] = Mon.get_feature_state(health_summary,
+                                                        pg_states)
 
-        self.logger.debug(
-            'Features:{}'.format(json.dumps(cluster['features'])))
+            self.logger.debug(
+                'Features:{}'.format(json.dumps(cluster['features'])))
 
         return cluster, health_data
 
     def _mon_health(self):
 
         cluster, health_data = self._mon_health_common()
+        if cluster:
+            services = health_data.get('health').get('health_services')
+            mon_status = {}
+            for svc in services:
+                if 'mons' in svc:
+                    # Each monitor will have a numeric value denoting health
+                    mon_status = {mon.get('name'): Mon.health.get(mon.get('health'))
+                                  for mon in svc.get('mons')}
 
-        services = health_data.get('health').get('health_services')
-        mon_status = {}
-        for svc in services:
-            if 'mons' in svc:
-                # Each monitor will have a numeric value denoting health
-                mon_status = {mon.get('name'): Mon.health.get(mon.get('health'))
-                              for mon in svc.get('mons')}
+            cluster['mon_status'] = mon_status
 
-        cluster['mon_status'] = mon_status
-
-        self.manage_event(health_data.get('overall_status'),
-                          health_data.get('summary', []),
-                          mon_status)
+            self.manage_event(health_data.get('overall_status'),
+                              health_data.get('summary', []),
+                              mon_status)
 
         return cluster
 
@@ -558,14 +567,24 @@ class Mon(BaseCollector):
 
         start = time.time()
 
-        pool_stats = self._get_pool_stats()
-        num_osd_hosts, osd_states = self._get_osd_states()
         cluster_state = self.get_mon_health()
-        cluster_state['num_osd_hosts'] = num_osd_hosts
-        cluster_state['num_rbds'] = self._get_rbds(cluster_state['mon_status'])
+        if cluster_state:
+
+            pool_stats = self._get_pool_stats()
+            num_osd_hosts, osd_states = self._get_osd_states()
+
+            cluster_state['num_osd_hosts'] = num_osd_hosts
+            cluster_state['num_rbds'] = self._get_rbds(cluster_state['mon_status'])
+
+            all_stats = merge_dicts(cluster_state, {"pools": pool_stats,
+                                                    "osd_state": osd_states})
+        else:
+            all_stats = {}
+            self.error = True
+            msg = 'MON socket is not available...is ceph-mon active?'
+            self.error_msgs = [msg]
+            self.logger.warning(msg)
 
-        all_stats = merge_dicts(cluster_state, {"pools": pool_stats,
-                                                "osd_state": osd_states})
         all_stats['ceph_version'] = self.version
 
         end = time.time()
-- 
2.47.3