From: Dmitriy Rabotjagov Date: Fri, 25 Jan 2019 19:08:58 +0000 (+0200) Subject: mgr/zabbix Implements pools discovery and per-pool statistics X-Git-Tag: v15.1.0~2751^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4f7f73b3facf3cd57f922462bf24e7afc4714dc6;p=ceph.git mgr/zabbix Implements pools discovery and per-pool statistics This commit adds the ability for the zabbix mgr plugin to discover ceph pools and send to zabbix not only generalized but also pool-related data. These metrics are pretty important, as they make it possible to see which exact pool uses most of the iops or disk space Signed-off-by: Dmitriy Rabotjagov --- diff --git a/doc/mgr/zabbix.rst b/doc/mgr/zabbix.rst index c4fc32a144a6..62f513fd58a9 100644 --- a/doc/mgr/zabbix.rst +++ b/doc/mgr/zabbix.rst @@ -68,6 +68,7 @@ Additional configuration keys which can be configured and their default values: - zabbix_port: 10051 - zabbix_sender: /usr/bin/zabbix_sender - interval: 60 +- discovery_interval: 100 Configuration keys ^^^^^^^^^^^^^^^^^^^ @@ -113,6 +114,14 @@ This can be done with this command: The module will now send its latest data to the Zabbix server. +Items discovery is accomplished also via zabbix_sender, and runs every `discovery_interval * interval` seconds. 
If you wish to launch discovery +manually, this can be done with this command: + +:: + + ceph zabbix discovery + + Debugging --------- diff --git a/src/pybind/mgr/zabbix/module.py b/src/pybind/mgr/zabbix/module.py index 6b428dc0acca..78575a516e69 100644 --- a/src/pybind/mgr/zabbix/module.py +++ b/src/pybind/mgr/zabbix/module.py @@ -79,6 +79,11 @@ class Module(MgrModule): 'name': 'interval', 'type': 'secs', 'default': 60 + }, + { + 'name': 'discovery_interval', + 'type': 'count', + 'default': 100 } ] @@ -99,6 +104,11 @@ class Module(MgrModule): "desc": "Force sending data to Zabbix", "perm": "rw" }, + { + "cmd": "zabbix discovery", + "desc": "Discovering Zabbix data", + "perm": "r" + }, ] def __init__(self, *args, **kwargs): @@ -117,7 +127,7 @@ class Module(MgrModule): raise RuntimeError('{0} is a unknown configuration ' 'option'.format(option)) - if option in ['zabbix_port', 'interval']: + if option in ['zabbix_port', 'interval', 'discovery_interval']: try: value = int(value) except (ValueError, TypeError): @@ -127,6 +137,12 @@ class Module(MgrModule): if option == 'interval' and value < 10: raise RuntimeError('interval should be set to at least 10 seconds') + if option == 'discovery_interval' and value < 10: + raise RuntimeError( + "discovery_interval should not be more frequent" + "than once in 10 regular data collection" + ) + self.log.debug('Setting in-memory config option %s to: %s', option, value) self.config[option] = value @@ -185,6 +201,12 @@ class Module(MgrModule): rd_ops += pool['stats']['rd'] wr_bytes += pool['stats']['wr_bytes'] rd_bytes += pool['stats']['rd_bytes'] + data['[{0},rd_bytes]'.format(pool['name'])] = pool['stats']['rd_bytes'] + data['[{0},wr_bytes]'.format(pool['name'])] = pool['stats']['wr_bytes'] + data['[{0},rd_ops]'.format(pool['name'])] = pool['stats']['rd'] + data['[{0},wr_ops]'.format(pool['name'])] = pool['stats']['wr'] + data['[{0},bytes_used]'.format(pool['name'])] = pool['stats']['bytes_used'] + 
data['[{0},raw_bytes_used]'.format(pool['name'])] = pool['stats']['raw_bytes_used'] data['wr_ops'] = wr_ops data['rd_ops'] = rd_ops @@ -250,9 +272,7 @@ class Module(MgrModule): return data - def send(self): - data = self.get_data() - + def send(self, data): identifier = self.config['identifier'] if identifier is None or len(identifier) == 0: identifier = 'ceph-{0}'.format(self.fsid) @@ -294,6 +314,27 @@ class Module(MgrModule): return False + def discovery(self): + pools = self.get('osd_map')['pools'] + crush_rules = self.get('osd_map_crush')['rules'] + + pool_discovery = { + pool['pool_name']: step['item_name'] + for pool in pools + for rule in crush_rules if rule['rule_id'] == pool['crush_rule'] + for step in rule['steps'] if step['op'] == "take" + } + + discovery_data = {"data": []} + for pool, rule in pool_discovery.items(): + discovery_data["data"].append({ + "{#POOL}": pool, + "{#CRUSH_RULE}": rule + }) + + data = {"zabbix.discovery": json.dumps(discovery_data)} + return bool(self.send(data)) + def handle_command(self, inbuf, command): if command['prefix'] == 'zabbix config-show': return 0, json.dumps(self.config), '' @@ -312,10 +353,16 @@ class Module(MgrModule): 'Failed to update configuration option {0}'.format(key), '' elif command['prefix'] == 'zabbix send': - if self.send(): + data = self.get_data() + if self.send(data): return 0, 'Sending data to Zabbix', '' return 1, 'Failed to send data to Zabbix', '' + + elif command['prefix'] == 'zabbix discovery': + if self.discovery(): + return 0, 'Sending discovery data to Zabbix', '' + else: return (-errno.EINVAL, '', "Command not found '{0}'".format(command['prefix'])) @@ -331,11 +378,25 @@ class Module(MgrModule): self.init_module_config() + discovery_interval = self.config['discovery_interval'] + # We are sending discovery once plugin is loaded + discovery_counter = discovery_interval while self.run: self.log.debug('Waking up for new iteration') + if discovery_counter == discovery_interval: + try: + 
self.discovery() + except Exception as exc: + # Shouldn't happen, but let's log it and retry next interval, + # rather than dying completely. + self.log.exception("Unexpected error during discovery():") + finally: + discovery_counter = 0 + try: - self.send() + data = self.get_data() + self.send(data) except Exception as exc: # Shouldn't happen, but let's log it and retry next interval, # rather than dying completely. @@ -343,6 +404,7 @@ class Module(MgrModule): interval = self.config['interval'] self.log.debug('Sleeping for %d seconds', interval) + discovery_counter += 1 self.event.wait(interval) def self_test(self): diff --git a/src/pybind/mgr/zabbix/zabbix_template.xml b/src/pybind/mgr/zabbix/zabbix_template.xml index 1ce76605ac07..72f836693944 100644 --- a/src/pybind/mgr/zabbix/zabbix_template.xml +++ b/src/pybind/mgr/zabbix/zabbix_template.xml @@ -1959,7 +1959,360 @@ - + + + Ceph pool discovery + 2 + + + ceph.zabbix.discovery + 0 + 0 + + + + 0 + 0 + + 0 + + + + 0 + + + + + + + 0 + + + + 90 + + + + [{#POOL}] Pool Used + 2 + + + ceph.[{#POOL},bytes_used] + 0 + 90 + 365 + 0 + 3 + + b + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [{#POOL}] Pool RAW Used + 2 + + + ceph.[{#POOL},raw_bytes_used] + 0 + 90 + 365 + 0 + 3 + + b + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [{#POOL}] Pool Read bandwidth + 2 + + + ceph.[{#POOL},rd_bytes] + 0 + 90 + 365 + 0 + 3 + + bytes + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + 10 + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [{#POOL}] Pool Read operations + 2 + + + ceph.[{#POOL},rd_ops] + 0 + 90 + 365 + 0 + 3 + + ops + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + 10 + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [{#POOL}] Pool Write bandwidth + 2 + + + ceph.[{#POOL},wr_bytes] + 0 + 90 + 365 + 0 + 3 + + bytes + 0 + + + 0 + 0 + + 0 + + 1 + + + 
+ 0 + + + + + + + 0 + + + Ceph + + + + + + + 10 + + + + + + + Ceph {#CRUSH_RULE} + + + + + + [{#POOL}] Pool Write operations + 2 + + + ceph.[{#POOL},wr_ops] + 0 + 90 + 365 + 0 + 3 + + ops + 0 + + + 0 + 0 + + 0 + + 1 + + + + 0 + + + + + + + 0 + + + Ceph + + + + + + + 10 + + + + + + + Ceph {#CRUSH_RULE} + + + + + + + + + + +