]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/zabbix: Implement pool discovery and per-pool statistics
authorDmitriy Rabotjagov <dmitriy.r@sitevalley.com>
Fri, 25 Jan 2019 19:08:58 +0000 (21:08 +0200)
committerDmitriy Rabotjagov <dmitriy.r@sitevalley.com>
Fri, 25 Jan 2019 19:14:03 +0000 (21:14 +0200)
This commit makes it possible for the zabbix mgr plugin to discover
ceph pools and to send Zabbix not only cluster-wide
but also per-pool data.

These metrics are important, as they make it possible
to see exactly which pool uses most of the IOPS or disk space.

Signed-off-by: Dmitriy Rabotjagov <noonedeadpunk@ya.ru>
doc/mgr/zabbix.rst
src/pybind/mgr/zabbix/module.py
src/pybind/mgr/zabbix/zabbix_template.xml

index c4fc32a144a6955a12243203810fa4d52f4dbd51..62f513fd58a9816e309fa95d5f8e6ccc3d09e4f0 100644 (file)
@@ -68,6 +68,7 @@ Additional configuration keys which can be configured and their default values:
 - zabbix_port: 10051
 - zabbix_sender: /usr/bin/zabbix_sender
 - interval: 60
+- discovery_interval: 100
 
 Configuration keys
 ^^^^^^^^^^^^^^^^^^^
@@ -113,6 +114,14 @@ This can be done with this command:
 
 The module will now send its latest data to the Zabbix server.
 
+Item discovery is also performed via zabbix_sender and runs every `discovery_interval * interval` seconds. If you wish to launch discovery
+manually, this can be done with this command:
+
+::
+
+    ceph zabbix discovery
+
+
 Debugging
 ---------
 
index 6b428dc0acca57016f2b06b51ac1aea22ab0b241..78575a516e69770eda93cbc5aac1eb70276eac62 100644 (file)
@@ -79,6 +79,11 @@ class Module(MgrModule):
                 'name': 'interval',
                 'type': 'secs',
                 'default': 60
+            },
+            {
+                'name': 'discovery_interval',
+                'type': 'count',
+                'default': 100
             }
     ]
 
@@ -99,6 +104,11 @@ class Module(MgrModule):
             "desc": "Force sending data to Zabbix",
             "perm": "rw"
         },
+        {
+            "cmd": "zabbix discovery",
+            "desc": "Discovering Zabbix data",
+            "perm": "r"
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -117,7 +127,7 @@ class Module(MgrModule):
             raise RuntimeError('{0} is a unknown configuration '
                                'option'.format(option))
 
-        if option in ['zabbix_port', 'interval']:
+        if option in ['zabbix_port', 'interval', 'discovery_interval']:
             try:
                 value = int(value)
             except (ValueError, TypeError):
@@ -127,6 +137,12 @@ class Module(MgrModule):
         if option == 'interval' and value < 10:
             raise RuntimeError('interval should be set to at least 10 seconds')
 
+        if option == 'discovery_interval' and value < 10:
+            raise RuntimeError(
+                "discovery_interval should not be more frequent "
+                "than once in 10 regular data collection"
+            )
+
         self.log.debug('Setting in-memory config option %s to: %s', option,
                        value)
         self.config[option] = value
@@ -185,6 +201,12 @@ class Module(MgrModule):
             rd_ops += pool['stats']['rd']
             wr_bytes += pool['stats']['wr_bytes']
             rd_bytes += pool['stats']['rd_bytes']
+            data['[{0},rd_bytes]'.format(pool['name'])] = pool['stats']['rd_bytes']
+            data['[{0},wr_bytes]'.format(pool['name'])] = pool['stats']['wr_bytes']
+            data['[{0},rd_ops]'.format(pool['name'])] = pool['stats']['rd']
+            data['[{0},wr_ops]'.format(pool['name'])] = pool['stats']['wr']
+            data['[{0},bytes_used]'.format(pool['name'])] = pool['stats']['bytes_used']
+            data['[{0},raw_bytes_used]'.format(pool['name'])] = pool['stats']['raw_bytes_used']
 
         data['wr_ops'] = wr_ops
         data['rd_ops'] = rd_ops
@@ -250,9 +272,7 @@ class Module(MgrModule):
 
         return data
 
-    def send(self):
-        data = self.get_data()
-
+    def send(self, data):
         identifier = self.config['identifier']
         if identifier is None or len(identifier) == 0:
             identifier = 'ceph-{0}'.format(self.fsid)
@@ -294,6 +314,27 @@ class Module(MgrModule):
 
         return False
 
+    def discovery(self):
+        """Send pool discovery data to Zabbix; return bool(send(...))."""
+        pools = self.get('osd_map')['pools']
+        rules = self.get('osd_map_crush')['rules']
+
+        # Map pool name -> item named by a "take" step of its CRUSH rule.
+        take_item = {}
+        for pool in pools:
+            for rule in rules:
+                if rule['rule_id'] != pool['crush_rule']:
+                    continue
+                for step in rule['steps']:
+                    if step['op'] == "take":
+                        take_item[pool['pool_name']] = step['item_name']
+
+        # One discovery row per pool, keyed by the macros the template uses.
+        entries = [{"{#POOL}": name, "{#CRUSH_RULE}": item}
+                   for name, item in take_item.items()]
+        payload = {"zabbix.discovery": json.dumps({"data": entries})}
+        return bool(self.send(payload))
+
     def handle_command(self, inbuf, command):
         if command['prefix'] == 'zabbix config-show':
             return 0, json.dumps(self.config), ''
@@ -312,10 +353,16 @@ class Module(MgrModule):
                 'Failed to update configuration option {0}'.format(key), ''
 
         elif command['prefix'] == 'zabbix send':
-            if self.send():
+            data = self.get_data()
+            if self.send(data):
                 return 0, 'Sending data to Zabbix', ''
 
             return 1, 'Failed to send data to Zabbix', ''
+
+        elif command['prefix'] == 'zabbix discovery':
+            if self.discovery():
+                return 0, 'Sending discovery data to Zabbix', ''
+            return 1, 'Failed to send discovery data to Zabbix', ''
         else:
             return (-errno.EINVAL, '',
                     "Command not found '{0}'".format(command['prefix']))
@@ -331,11 +378,25 @@ class Module(MgrModule):
 
         self.init_module_config()
 
+        discovery_interval = self.config['discovery_interval']
+        # We are sending discovery once plugin is loaded
+        discovery_counter = discovery_interval
         while self.run:
             self.log.debug('Waking up for new iteration')
 
+            if discovery_counter == discovery_interval:
+                try:
+                    self.discovery()
+                except Exception as exc:
+                    # Shouldn't happen, but let's log it and retry next interval,
+                    # rather than dying completely.
+                    self.log.exception("Unexpected error during discovery():")
+                finally:
+                    discovery_counter = 0
+
             try:
-                self.send()
+                data = self.get_data()
+                self.send(data)
             except Exception as exc:
                 # Shouldn't happen, but let's log it and retry next interval,
                 # rather than dying completely.
@@ -343,6 +404,7 @@ class Module(MgrModule):
 
             interval = self.config['interval']
             self.log.debug('Sleeping for %d seconds', interval)
+            discovery_counter += 1
             self.event.wait(interval)
 
     def self_test(self):
index 1ce76605ac07ffaf2fce7eeac9995c76664d5e81..72f8366939445fe429dfa78044344f336d8b2c47 100644 (file)
                     <logtimefmt/>
                 </item>
             </items>
-            <discovery_rules/>
+            <discovery_rules>
+                <discovery_rule>
+                    <name>Ceph pool discovery</name>
+                    <type>2</type>
+                    <snmp_community/>
+                    <snmp_oid/>
+                    <key>ceph.zabbix.discovery</key>
+                    <delay>0</delay>
+                    <status>0</status>
+                    <allowed_hosts/>
+                    <snmpv3_contextname/>
+                    <snmpv3_securityname/>
+                    <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                    <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                    <snmpv3_authpassphrase/>
+                    <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                    <snmpv3_privpassphrase/>
+                    <params/>
+                    <ipmi_sensor/>
+                    <authtype>0</authtype>
+                    <username/>
+                    <password/>
+                    <publickey/>
+                    <privatekey/>
+                    <port/>
+                    <filter>
+                        <evaltype>0</evaltype>
+                        <formula/>
+                        <conditions/>
+                    </filter>
+                    <lifetime>90</lifetime>
+                    <description/>
+                    <item_prototypes>
+                        <item_prototype>
+                            <name>[{#POOL}] Pool Used</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[{#POOL},bytes_used]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units>b</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[{#POOL}] Pool RAW Used</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[{#POOL},raw_bytes_used]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units>b</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing/>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[{#POOL}] Pool Read bandwidth</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[{#POOL},rd_bytes]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units>bytes</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing>
+                                <step>
+                                    <type>10</type>
+                                    <params/>
+                                </step>
+                            </preprocessing>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[{#POOL}] Pool Read operations</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[{#POOL},rd_ops]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units>ops</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing>
+                                <step>
+                                    <type>10</type>
+                                    <params/>
+                                </step>
+                            </preprocessing>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[{#POOL}] Pool Write bandwidth</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[{#POOL},wr_bytes]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units>bytes</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing>
+                                <step>
+                                    <type>10</type>
+                                    <params/>
+                                </step>
+                            </preprocessing>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                        <item_prototype>
+                            <name>[{#POOL}] Pool Write operations</name>
+                            <type>2</type>
+                            <snmp_community/>
+                            <snmp_oid/>
+                            <key>ceph.[{#POOL},wr_ops]</key>
+                            <delay>0</delay>
+                            <history>90</history>
+                            <trends>365</trends>
+                            <status>0</status>
+                            <value_type>3</value_type>
+                            <allowed_hosts/>
+                            <units>ops</units>
+                            <delta>0</delta>
+                            <snmpv3_contextname/>
+                            <snmpv3_securityname/>
+                            <snmpv3_securitylevel>0</snmpv3_securitylevel>
+                            <snmpv3_authprotocol>0</snmpv3_authprotocol>
+                            <snmpv3_authpassphrase/>
+                            <snmpv3_privprotocol>0</snmpv3_privprotocol>
+                            <snmpv3_privpassphrase/>
+                            <formula>1</formula>
+                            <delay_flex/>
+                            <params/>
+                            <ipmi_sensor/>
+                            <authtype>0</authtype>
+                            <username/>
+                            <password/>
+                            <publickey/>
+                            <privatekey/>
+                            <port/>
+                            <description/>
+                            <inventory_link>0</inventory_link>
+                            <applications>
+                                <application>
+                                    <name>Ceph</name>
+                                </application>
+                            </applications>
+                            <valuemap/>
+                            <logtimefmt/>
+                            <preprocessing>
+                                <step>
+                                    <type>10</type>
+                                    <params/>
+                                </step>
+                            </preprocessing>
+                            <jmx_endpoint/>
+                            <application_prototypes>
+                                <application_prototype>
+                                    <name>Ceph {#CRUSH_RULE}</name>
+                                </application_prototype>
+                            </application_prototypes>
+                            <master_item_prototype/>
+                        </item_prototype>
+                    </item_prototypes>
+                    <trigger_prototypes/>
+                    <graph_prototypes/>
+                    <host_prototypes/>
+                    <jmx_endpoint/>
+                </discovery_rule>
+            </discovery_rules>
             <macros/>
             <templates/>
             <screens>