From: Paul Cuzner Date: Wed, 23 Aug 2017 21:14:44 +0000 (+1200) Subject: mon: simplify the admin_socket read logic X-Git-Tag: v1.0~28^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F108%2Fhead;p=cephmetrics.git mon: simplify the admin_socket read logic The initial commit placed logic in each area that called the admin socket. This patch separates the admin socket call out to a separate method, so it gets checked in one place. Some tidy up and comments added too. --- diff --git a/collectors/mon.py b/collectors/mon.py index 1b608c1..4dd2f38 100644 --- a/collectors/mon.py +++ b/collectors/mon.py @@ -253,28 +253,27 @@ class Mon(BaseCollector): return stuck_pgs - def _mon_health_new(self): + def _mon_health_new(self, cluster_data): - cluster, health_data = self._mon_health_common() + cluster, health_data = self._mon_health_common(cluster_data) - if cluster: - mon_status_output = self._mon_command('mon_status') - quorum_list = mon_status_output.get('quorum') - mon_list = mon_status_output.get('monmap').get('mons') - mon_status = {} - for mon in mon_list: - state = 0 if mon.get('rank') in quorum_list else 4 - mon_status[mon.get('name')] = state + mon_status_output = self._mon_command('mon_status') + quorum_list = mon_status_output.get('quorum') + mon_list = mon_status_output.get('monmap').get('mons') + mon_status = {} + for mon in mon_list: + state = 0 if mon.get('rank') in quorum_list else 4 + mon_status[mon.get('name')] = state - cluster['mon_status'] = mon_status + cluster['mon_status'] = mon_status - self.manage_event(health_data.get('status'), - health_data.get('summary', []), - mon_status) + self.manage_event(health_data.get('status'), + health_data.get('summary', []), + mon_status) return cluster - def _mon_health_common(self): + def _mon_health_common(self, cluster_data): # for v12 (Luminous and beyond) add the following setting to # ceph.conf "mon_health_preluminous_compat=true" @@ -283,48 +282,49 @@ class Mon(BaseCollector): health_data = {} cluster = {} - cluster_data = self._admin_socket().get('cluster', {}) - if cluster_data: - pg_data = self._mon_command("pg stat") - health_data = self._mon_command("health") - health_text = health_data.get('status', - health_data.get('overall_status', '')) + pg_data = self._mon_command("pg stat") + health_data = self._mon_command("health") + health_text = health_data.get('status', + health_data.get('overall_status', '')) - cluster = {Mon.cluster_metrics[k][0]: cluster_data[k] - for k in cluster_data} + cluster = {Mon.cluster_metrics[k][0]: cluster_data[k] + for k in cluster_data} - health_num = Mon.health.get(health_text, 16) + health_num = Mon.health.get(health_text, 16) - cluster['health'] = health_num + cluster['health'] = health_num - pg_states = pg_data.get('num_pg_by_state') # list of dict name,num - health_summary = health_data.get('summary', []) # list of issues - cluster['num_pgs_stuck'] = Mon.check_stuck_pgs(health_summary) - cluster['features'] = Mon.get_feature_state(health_summary, - pg_states) + pg_states = pg_data.get('num_pg_by_state') # list of dict name,num + health_summary = health_data.get('summary', []) # list of issues + cluster['num_pgs_stuck'] = Mon.check_stuck_pgs(health_summary) + cluster['features'] = Mon.get_feature_state(health_summary, + pg_states) - self.logger.debug( - 'Features:{}'.format(json.dumps(cluster['features']))) + self.logger.debug( + 'Features:{}'.format(json.dumps(cluster['features']))) return cluster, health_data - def _mon_health(self): + def get_cluster_state(self): + return self._admin_socket().get('cluster', {}) + + def _mon_health(self, cluster_data): + + cluster, health_data = self._mon_health_common(cluster_data) - cluster, health_data = self._mon_health_common() - if cluster: - services = health_data.get('health').get('health_services') - mon_status = {} - for svc in services: - if 'mons' in svc: - # Each monitor will have a numeric value denoting health - mon_status = {mon.get('name'): Mon.health.get(mon.get('health')) - for mon in svc.get('mons')} + services = health_data.get('health').get('health_services') + mon_status = {} + for svc in services: + if 'mons' in svc: + # Each monitor will have a numeric value denoting health + mon_status = {mon.get('name'): Mon.health.get(mon.get('health')) + for mon in svc.get('mons')} - cluster['mon_status'] = mon_status + cluster['mon_status'] = mon_status - self.manage_event(health_data.get('overall_status'), - health_data.get('summary', []), - mon_status) + self.manage_event(health_data.get('overall_status'), + health_data.get('summary', []), + mon_status) return cluster @@ -410,6 +410,8 @@ class Mon(BaseCollector): '"data":"{}"}}'.format(tag_name, event_message)) except requests.ConnectionError: + # if we hit this, the endpoint wasn't there (graphite web was not + # accessible) so identify that issue as a server error (500) return 500 else: @@ -512,8 +514,13 @@ class Mon(BaseCollector): return pools_to_scan - def get_pools(self): - skip_pools = ('default.rgw') + def get_rbd_pools(self): + """ + Look at the rados pools to filter out pools that would normally not + be associated with rbd images + :return: (list) of pools that may contain rbd images + """ + skip_pools = ('default.rgw', '.rgw.') start = time.time() conf_file = "/etc/ceph/{}.conf".format(self.cluster_name) @@ -529,8 +536,15 @@ class Mon(BaseCollector): return filtered_pools def _get_rbds(self, monitors): + """ + Scan a subset of the rados pools for rbd images. Each mon collector + will scan a subset of the pools to distribute the load using the + RBSScanner class + :param monitors: (dict) monitor names and states + :return total_rbs: (int) total rbd images found across pools + """ - pool_list = self.get_pools() + pool_list = self.get_rbd_pools() mon_list = sorted(monitors.keys()) my_pools = Mon._select_pools(pool_list, mon_list) self.logger.debug("Pools to be scanned on this mon" @@ -562,14 +576,18 @@ class Mon(BaseCollector): def get_stats(self): """ method associated with the plugin callback to gather the metrics - :return: (dict) metadata describing the state of the mon/osd's + :return: (dict) metadata describing the state of the mon/osd's etc """ start = time.time() - cluster_state = self.get_mon_health() - if cluster_state: + # Attempt to read the admin socket for cluster data + cluster_data = self.get_cluster_state() + + if cluster_data: + # read from the admin socket was OK, so process the data + cluster_state = self.get_mon_health(cluster_data) pool_stats = self._get_pool_stats() num_osd_hosts, osd_states = self._get_osd_states() @@ -579,6 +597,9 @@ class Mon(BaseCollector): all_stats = merge_dicts(cluster_state, {"pools": pool_stats, "osd_state": osd_states}) else: + # problem reading from the admin socket, record it in cephmetrics + # log and set the object's error flag so it can be picked up at the + # layer above the Mon instance (Ceph instance -> collectd log) all_stats = {} self.error = True msg = 'MON socket is not available...is ceph-mon active?'