From: Dmitriy Rabotjagov Date: Mon, 28 Jan 2019 18:21:01 +0000 (+0200) Subject: mgr/zabbix Added OSD discovery and per osd statistics X-Git-Tag: v15.1.0~2751^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6d34eab356fd74bb1cd0c3ddd74a719f53dfbc91;p=ceph.git mgr/zabbix Added OSD discovery and per osd statistics This commit implements OSD discovery for the Zabbix mgr plugin. Per-OSD data is now collected into Zabbix, so it is possible to see latency statistics per OSD and to find a problem OSD using these metrics. It also adds Zabbix triggers, which report full or near-full OSDs. Signed-off-by: Dmitriy Rabotjagov --- diff --git a/src/pybind/mgr/zabbix/module.py b/src/pybind/mgr/zabbix/module.py index 78575a516e69..fd471eea54d7 100644 --- a/src/pybind/mgr/zabbix/module.py +++ b/src/pybind/mgr/zabbix/module.py @@ -224,9 +224,11 @@ class Module(MgrModule): num_up = 0 num_in = 0 for osd in osd_map['osds']: + data['[osd.{0},up]'.format(int(osd['osd']))] = osd['up'] if osd['up'] == 1: num_up += 1 + data['[osd.{0},in]'.format(int(osd['osd']))] = osd['in'] if osd['in'] == 1: num_in += 1 @@ -240,12 +242,22 @@ class Module(MgrModule): osd_stats = self.get('osd_stats') for osd in osd_stats['osd_stats']: - if osd['kb'] == 0: + try: + osd_fill.append((float(osd['kb_used']) / float(osd['kb'])) * 100) + data['[osd.{0},osd_fill]'.format(osd['osd'])] = ( + float(osd['kb_used']) / float(osd['kb'])) * 100 + except ZeroDivisionError: continue - osd_fill.append((float(osd['kb_used']) / float(osd['kb'])) * 100) osd_pgs.append(osd['num_pgs']) osd_apply_latency_ns.append(osd['perf_stat']['apply_latency_ns']) osd_commit_latency_ns.append(osd['perf_stat']['commit_latency_ns']) + data['[osd.{0},num_pgs]'.format(osd['osd'])] = osd['num_pgs'] + data[ + '[osd.{0},osd_latency_apply]'.format(osd['osd']) + ] = osd['perf_stat']['apply_latency_ns'] / 1000000.0 # ns -> ms + data[ + '[osd.{0},osd_latency_commit]'.format(osd['osd']) + ] = 
osd['perf_stat']['commit_latency_ns'] / 1000000.0 # ns -> ms try: data['osd_max_fill'] = max(osd_fill) @@ -315,24 +327,57 @@ class Module(MgrModule): return False def discovery(self): - pools = self.get('osd_map')['pools'] - crush_rules = self.get('osd_map_crush')['rules'] + osd_map = self.get('osd_map') + osd_map_crush = self.get('osd_map_crush') + # Discovering ceph pools pool_discovery = { pool['pool_name']: step['item_name'] - for pool in pools - for rule in crush_rules if rule['rule_id'] == pool['crush_rule'] + for pool in osd_map['pools'] + for rule in osd_map_crush['rules'] if rule['rule_id'] == pool['crush_rule'] for step in rule['steps'] if step['op'] == "take" } - - discovery_data = {"data": []} - for pool, rule in pool_discovery.items(): - discovery_data["data"].append({ + pools_discovery_data = {"data": [ + { "{#POOL}": pool, "{#CRUSH_RULE}": rule - }) - - data = {"zabbix.discovery": json.dumps(discovery_data)} + } + for pool, rule in pool_discovery.items() + ]} + + # Discovering OSDs + # Getting hosts for found crush rules + osd_roots = { + step['item_name']: [ + item['id'] + for item in root_bucket['items'] + ] + for rule in osd_map_crush['rules'] + for step in rule['steps'] if step['op'] == "take" + for root_bucket in osd_map_crush['buckets'] + if root_bucket['id'] == step['item'] + } + # Getting osds for hosts with map to crush_rule + osd_discovery = { + item['id']: crush_rule + for crush_rule, roots in osd_roots.items() + for root in roots + for bucket in osd_map_crush['buckets'] + if bucket['id'] == root + for item in bucket['items'] + } + osd_discovery_data = {"data": [ + { + "{#OSD}": osd, + "{#CRUSH_RULE}": rule + } + for osd, rule in osd_discovery.items() + ]} + # Preparing received data for sending + data = { + "zabbix.pool.discovery": json.dumps(pools_discovery_data), + "zabbix.osd.discovery": json.dumps(osd_discovery_data) + } return bool(self.send(data)) def handle_command(self, inbuf, command): @@ -363,6 +408,8 @@ class Module(MgrModule): 
if self.discovery(): return 0, 'Sending discovery data to Zabbix', '' + return 1, 'Failed to send discovery data to Zabbix', '' + else: return (-errno.EINVAL, '', "Command not found '{0}'".format(command['prefix'])) diff --git a/src/pybind/mgr/zabbix/zabbix_template.xml b/src/pybind/mgr/zabbix/zabbix_template.xml index 72f836693944..4d005280d37d 100644 --- a/src/pybind/mgr/zabbix/zabbix_template.xml +++ b/src/pybind/mgr/zabbix/zabbix_template.xml @@ -1960,12 +1960,393 @@ + + Ceph OSD discovery + 2 + + + ceph.zabbix.osd.discovery + 0 + 0 + + + + 0 + 0 + + 0 + + + + 0 + + + + + + + 0 + + + + 90 + + + + [osd.{#OSD}] OSD in + 2 + + + ceph.[osd.{#OSD},in] + 0 + 90 + 365 + 0 + 3 + + + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [osd.{#OSD}] OSD PGs + 2 + + + ceph.[osd.{#OSD},num_pgs] + 0 + 90 + 365 + 0 + 3 + + + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [osd.{#OSD}] OSD fill + 2 + + + ceph.[osd.{#OSD},osd_fill] + 0 + 90 + 365 + 0 + 0 + + % + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [osd.{#OSD}] OSD latency apply + 2 + + + ceph.[osd.{#OSD},osd_latency_apply] + 0 + 90 + 365 + 0 + 0 + + ms + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [osd.{#OSD}] OSD latency commit + 2 + + + ceph.[osd.{#OSD},osd_latency_commit] + 0 + 90 + 365 + 0 + 0 + + ms + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [osd.{#OSD}] OSD up + 2 + + + ceph.[osd.{#OSD},up] + 0 + 90 + 365 + 0 + 3 + + + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + + + {ceph-mgr Zabbix module:ceph.[osd.{#OSD},up].last()}=0 + 0 + + Ceph OSD osd.{#OSD} is DOWN + 0 + + + 0 + 2 + + 0 + 0 + + + + + {ceph-mgr Zabbix 
module:ceph.[osd.{#OSD},osd_fill].last()}>={ceph-mgr Zabbix module:ceph.osd_full_ratio.last()} + 0 + + Ceph OSD osd.{#OSD} is full: {ITEM.VALUE}% + 0 + + + 0 + 4 + + 0 + 0 + + + + + {ceph-mgr Zabbix module:ceph.[osd.{#OSD},osd_fill].last()}>={ceph-mgr Zabbix module:ceph.osd_nearfull_ratio.last()} + 0 + + Ceph OSD osd.{#OSD} is near full: {ITEM.VALUE}% + 0 + + + 0 + 2 + + 0 + 0 + + + + + + + + Ceph pool discovery 2 - ceph.zabbix.discovery + ceph.zabbix.pool.discovery 0 0