]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/zabbix: Add OSD discovery and per-OSD statistics
author: Dmitriy Rabotjagov <dmitriy.r@sitevalley.com>
Mon, 28 Jan 2019 18:21:01 +0000 (20:21 +0200)
committer: Dmitriy Rabotjagov <dmitriy.r@sitevalley.com>
Mon, 28 Jan 2019 18:21:01 +0000 (20:21 +0200)
This commit implements OSD discovery for the Zabbix mgr plugin.
Per-OSD data is now collected into Zabbix, so it is possible to see latency
statistics per OSD and to find a problem OSD using these metrics.
It also adds Zabbix triggers that report when an OSD is full or near full.

Signed-off-by: Dmitriy Rabotjagov <noonedeadpunk@ya.ru>
src/pybind/mgr/zabbix/module.py
src/pybind/mgr/zabbix/zabbix_template.xml

index 78575a516e69770eda93cbc5aac1eb70276eac62..fd471eea54d7d8b5c0f102314b21374768b92030 100644 (file)
@@ -224,9 +224,11 @@ class Module(MgrModule):
         num_up = 0
         num_in = 0
         for osd in osd_map['osds']:
+            data['[osd.{0},up]'.format(int(osd['osd']))] = osd['up']
             if osd['up'] == 1:
                 num_up += 1
 
+            data['[osd.{0},in]'.format(int(osd['osd']))] = osd['in']
             if osd['in'] == 1:
                 num_in += 1
 
@@ -240,12 +242,22 @@ class Module(MgrModule):
 
         osd_stats = self.get('osd_stats')
         for osd in osd_stats['osd_stats']:
-            if osd['kb'] == 0:
+            try:
+                osd_fill.append((float(osd['kb_used']) / float(osd['kb'])) * 100)
+                data['[osd.{0},osd_fill]'.format(osd['osd'])] = (
+                    float(osd['kb_used']) / float(osd['kb'])) * 100
+            except ZeroDivisionError:
                 continue
-            osd_fill.append((float(osd['kb_used']) / float(osd['kb'])) * 100)
             osd_pgs.append(osd['num_pgs'])
             osd_apply_latency_ns.append(osd['perf_stat']['apply_latency_ns'])
             osd_commit_latency_ns.append(osd['perf_stat']['commit_latency_ns'])
+            data['[osd.{0},num_pgs]'.format(osd['osd'])] = osd['num_pgs']
+            data[
+                '[osd.{0},osd_latency_apply]'.format(osd['osd'])
+            ] = osd['perf_stat']['apply_latency_ns']  / 1000000.0 # ns -> ms
+            data[
+                '[osd.{0},osd_latency_commit]'.format(osd['osd'])
+            ] = osd['perf_stat']['commit_latency_ns']  / 1000000.0 # ns -> ms
 
         try:
             data['osd_max_fill'] = max(osd_fill)
@@ -315,24 +327,57 @@ class Module(MgrModule):
         return False
 
     def discovery(self):
-        pools = self.get('osd_map')['pools']
-        crush_rules = self.get('osd_map_crush')['rules']
+        osd_map = self.get('osd_map')
+        osd_map_crush = self.get('osd_map_crush')
 
+        # Discovering ceph pools
         pool_discovery = {
             pool['pool_name']: step['item_name']
-            for pool in pools
-            for rule in crush_rules if rule['rule_id'] == pool['crush_rule']
+            for pool in osd_map['pools']
+            for rule in osd_map_crush['rules'] if rule['rule_id'] == pool['crush_rule']
             for step in rule['steps'] if step['op'] == "take"
         }
-
-        discovery_data = {"data": []}
-        for pool, rule in pool_discovery.items():
-            discovery_data["data"].append({
+        pools_discovery_data = {"data": [
+            {
                 "{#POOL}": pool,
                 "{#CRUSH_RULE}": rule
-            })
-
-        data = {"zabbix.discovery": json.dumps(discovery_data)}
+            }
+            for pool, rule in pool_discovery.items()
+        ]}
+
+        # Discovering OSDs
+        # Getting hosts for found crush rules
+        osd_roots = {
+            step['item_name']: [
+                item['id']
+                for item in root_bucket['items']
+            ]
+            for rule in osd_map_crush['rules']
+            for step in rule['steps'] if step['op'] == "take"
+            for root_bucket in osd_map_crush['buckets']
+            if root_bucket['id'] == step['item']
+        }
+        # Getting osds for hosts with map to crush_rule
+        osd_discovery = {
+            item['id']: crush_rule
+            for crush_rule, roots in osd_roots.items()
+            for root in roots
+            for bucket in osd_map_crush['buckets']
+            if bucket['id'] == root
+            for item in bucket['items']
+        }
+        osd_discovery_data = {"data": [
+            {
+                "{#OSD}": osd,
+                "{#CRUSH_RULE}": rule
+            }
+            for osd, rule in osd_discovery.items()
+        ]}
+        # Preparing received data for sending
+        data = {
+            "zabbix.pool.discovery": json.dumps(pools_discovery_data),
+            "zabbix.osd.discovery": json.dumps(osd_discovery_data)
+        }
         return bool(self.send(data))
 
     def handle_command(self, inbuf, command):
@@ -363,6 +408,8 @@ class Module(MgrModule):
             if self.discovery():
                 return 0, 'Sending discovery data to Zabbix', ''
 
+            return 1, 'Failed to send discovery data to Zabbix', ''
+
         else:
             return (-errno.EINVAL, '',
                     "Command not found '{0}'".format(command['prefix']))
index 72f8366939445fe429dfa78044344f336d8b2c47..4d005280d37da07100a23bb3672623e1f960a8f1 100644 (file)
                 </item>
             </items>
             <discovery_rules>
+                <discovery_rule>
+                    <name>Ceph OSD discovery</name>
+                    <type>2</type>
+                    <snmp_community/>
+                    <snmp_oid/>
+                    <key>ceph.zabbix.osd.discovery</key>
+                    <delay>0</delay>
+                    <status>0</status>
+                    <allowed_hosts/>
+                    <snmpv3_contextname/>
+                    <snmpv3_securityname/>
+                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                    <snmpv3_authpassphrase/>
+                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                    <snmpv3_privpassphrase/>
+                    <params/>
+                    <ipmi_sensor/>
+                    <authtype>0</authtype>
+                    <username/>
+                    <password/>
+                    <publickey/>
+                    <privatekey/>
+                    <port/>
+                    <filter>
+                        <evaltype>0</evaltype>
+                        <formula/>
+                        <conditions/>
+                    </filter>
+                    <lifetime>90</lifetime>
+                    <description/>
+                    <item_prototypes>
+                        <item_prototype>
+                            <name>[osd.{#OSD}] OSD in</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[osd.{#OSD},in]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units/>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[osd.{#OSD}] OSD PGs</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[osd.{#OSD},num_pgs]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units/>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[osd.{#OSD}] OSD fill</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[osd.{#OSD},osd_fill]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>0</value_type>
+                            <allowed_hosts/>
+                            <units>%</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[osd.{#OSD}] OSD latency apply</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[osd.{#OSD},osd_latency_apply]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>0</value_type>
+                            <allowed_hosts/>
+                            <units>ms</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[osd.{#OSD}] OSD latency commit</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[osd.{#OSD},osd_latency_commit]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>0</value_type>
+                            <allowed_hosts/>
+                            <units>ms</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[osd.{#OSD}] OSD up</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[osd.{#OSD},up]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units/>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                    </item_prototypes>
+                    <trigger_prototypes>
+                        <trigger_prototype>
+                            <expression>{ceph-mgr Zabbix module:ceph.[osd.{#OSD},up].last()}=0</expression>
+                            <recovery_mode>0</recovery_mode>
+                            <recovery_expression/>
+                            <name>Ceph OSD osd.{#OSD} is DOWN</name>
+                            <correlation_mode>0</correlation_mode>
+                            <correlation_tag/>
+                            <url/>
+                            <status>0</status>
+                            <priority>2</priority>
+                            <description/>
+                            <type>0</type>
+                            <manual_close>0</manual_close>
+                            <dependencies/>
+                            <tags/>
+                        </trigger_prototype>
+                        <trigger_prototype>
+                            <expression>{ceph-mgr Zabbix module:ceph.[osd.{#OSD},osd_fill].last()}&gt;={ceph-mgr Zabbix module:ceph.osd_full_ratio.last()}</expression>
+                            <recovery_mode>0</recovery_mode>
+                            <recovery_expression/>
+                            <name>Ceph OSD osd.{#OSD} is full: {ITEM.VALUE}%</name>
+                            <correlation_mode>0</correlation_mode>
+                            <correlation_tag/>
+                            <url/>
+                            <status>0</status>
+                            <priority>4</priority>
+                            <description/>
+                            <type>0</type>
+                            <manual_close>0</manual_close>
+                            <dependencies/>
+                            <tags/>
+                        </trigger_prototype>
+                        <trigger_prototype>
+                            <expression>{ceph-mgr Zabbix module:ceph.[osd.{#OSD},osd_fill].last()}&gt;={ceph-mgr Zabbix module:ceph.osd_nearfull_ratio.last()}</expression>
+                            <recovery_mode>0</recovery_mode>
+                            <recovery_expression/>
+                            <name>Ceph OSD osd.{#OSD} is near full: {ITEM.VALUE}%</name>
+                            <correlation_mode>0</correlation_mode>
+                            <correlation_tag/>
+                            <url/>
+                            <status>0</status>
+                            <priority>2</priority>
+                            <description/>
+                            <type>0</type>
+                            <manual_close>0</manual_close>
+                            <dependencies/>
+                            <tags/>
+                        </trigger_prototype>
+                    </trigger_prototypes>
+                    <graph_prototypes/>
+                    <host_prototypes/>
+                    <jmx_endpoint/>
+                </discovery_rule>
                 <discovery_rule>
                     <name>Ceph pool discovery</name>
                     <type>2</type>
                     <snmp_community/>
                     <snmp_oid/>
-                    <key>ceph.zabbix.discovery</key>
+                    <key>ceph.zabbix.pool.discovery</key>
                     <delay>0</delay>
                     <status>0</status>
                     <allowed_hosts/>