git-server-git.apps.pok.os.sepia.ceph.com Git - cephmetrics.git/commitdiff
Sleep after restart
author Boris Ranto <branto@redhat.com>
Tue, 13 Nov 2018 14:32:03 +0000 (15:32 +0100)
committer Boris Ranto <branto@redhat.com>
Tue, 13 Nov 2018 16:01:16 +0000 (17:01 +0100)
Signed-off-by: Boris Ranto <branto@redhat.com>
95 files changed:
.gitignore [new file with mode: 0644]
INSTALL.md [new file with mode: 0644]
LICENSE [new file with mode: 0644]
README [new file with mode: 0644]
README.md [new file with mode: 0644]
TODO [new file with mode: 0644]
ansible/ansible.cfg [new file with mode: 0644]
ansible/group_vars/all.yml.sample [new file with mode: 0644]
ansible/inventory.sample [new file with mode: 0644]
ansible/playbook.yml [new file with mode: 0644]
ansible/purge.yml [new file with mode: 0644]
ansible/roles/ceph-dashboard/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-dashboard/meta/main.yml [new file with mode: 0644]
ansible/roles/ceph-dashboard/tasks/configure_dashboard.yml [new file with mode: 0644]
ansible/roles/ceph-dashboard/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-dashboard/tasks/merge_vars.yml [new symlink]
ansible/roles/ceph-defaults/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-defaults/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-defaults/tasks/merge_vars.yml [new file with mode: 0644]
ansible/roles/ceph-defaults/tasks/setup_repos.yml [new file with mode: 0644]
ansible/roles/ceph-docker/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-docker/meta/main.yml [new file with mode: 0644]
ansible/roles/ceph-docker/tasks/install_packages.yml [new file with mode: 0644]
ansible/roles/ceph-docker/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-docker/tasks/merge_vars.yml [new symlink]
ansible/roles/ceph-docker/tasks/start_services.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/files/dashboards [new symlink]
ansible/roles/ceph-grafana/files/grafana-server.service [new file with mode: 0644]
ansible/roles/ceph-grafana/files/grafana.list [new file with mode: 0644]
ansible/roles/ceph-grafana/handlers/main.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/meta/main.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/tasks/configure_firewall.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/tasks/configure_grafana.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/tasks/grafana_plugins.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/tasks/install_packages.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/tasks/merge_vars.yml [new symlink]
ansible/roles/ceph-grafana/tasks/setup_container.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml [new file with mode: 0644]
ansible/roles/ceph-grafana/templates/grafana.ini [new file with mode: 0644]
ansible/roles/ceph-grafana/templates/grafana.repo [new file with mode: 0644]
ansible/roles/ceph-mgr/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-mgr/meta/main.yml [new file with mode: 0644]
ansible/roles/ceph-mgr/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-mgr/tasks/merge_vars.yml [new symlink]
ansible/roles/ceph-node-exporter/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/files/node_exporter.service [new file with mode: 0644]
ansible/roles/ceph-node-exporter/handlers/main.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/meta/main.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/tasks/install_packages.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/tasks/merge_vars.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/tasks/setup_container.yml [new file with mode: 0644]
ansible/roles/ceph-node-exporter/templates/sysconfig [new file with mode: 0644]
ansible/roles/ceph-prometheus/defaults/main.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/files/prometheus.service [new file with mode: 0644]
ansible/roles/ceph-prometheus/handlers/main.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/meta/main.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/tasks/install_packages.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/tasks/main.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/tasks/merge_vars.yml [new symlink]
ansible/roles/ceph-prometheus/tasks/setup_container.yml [new file with mode: 0644]
ansible/roles/ceph-prometheus/templates/prometheus.yml [new file with mode: 0644]
build_srpm [new file with mode: 0755]
dashboard-ansible.spec.in [new file with mode: 0644]
dashboards/README [new file with mode: 0644]
dashboards/ceph-cluster.json [new file with mode: 0644]
dashboards/cephfs-overview.json [new file with mode: 0644]
dashboards/host-details.json [new file with mode: 0644]
dashboards/hosts-overview.json [new file with mode: 0644]
dashboards/osd-device-details.json [new file with mode: 0644]
dashboards/osds-overview.json [new file with mode: 0644]
dashboards/pool-detail.json [new file with mode: 0644]
dashboards/pool-overview.json [new file with mode: 0644]
dashboards/radosgw-detail.json [new file with mode: 0644]
dashboards/radosgw-overview.json [new file with mode: 0644]
patches/0001-ansible-Disable-devel_mode.patch [new file with mode: 0644]
screenshots/archive/dashboard-2017-05-19.png [new file with mode: 0644]
screenshots/archive/dashboard-2017-05-24.png [new file with mode: 0644]
screenshots/archive/dashboard-2017-05-26.png [new file with mode: 0644]
screenshots/archive/dashboard-2017-05-29.png [new file with mode: 0644]
screenshots/at-a-glance.png [new file with mode: 0644]
screenshots/ceph-backend.png [new file with mode: 0644]
screenshots/ceph-frontend.png [new file with mode: 0644]
screenshots/ceph-rados.png [new file with mode: 0644]
screenshots/ceph-rgw.png [new file with mode: 0644]
screenshots/disk-busy-by-server.png [new file with mode: 0644]
screenshots/disk-latency-by-server.png [new file with mode: 0644]
screenshots/iops-by-server.png [new file with mode: 0644]
screenshots/network-load.png [new file with mode: 0644]
screenshots/osd-node-details.png [new file with mode: 0644]
tests/testosd.py [new file with mode: 0644]
tox.ini [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..a7079d9
--- /dev/null
@@ -0,0 +1,110 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# vim swap files
+*.swp
+
+# Python virtualenv
+virtualenv/
+
+# ansible retry files
+*.retry
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644 (file)
index 0000000..4ca0c09
--- /dev/null
@@ -0,0 +1,125 @@
+# Installation Process
+
+## Objective:   
+Run a Grafana instance to provide a monitoring dashboard to a ceph
+cluster.
+
+## Pre-requisites    
+### Monitoring host  
+- docker and docker-compose (for simplicity)  
+- grafana image (official latest 4.3 release from docker hub)  
+- graphite image (docker.io/abezhenar/graphite-centos7) 
+- clone the cephmetrics repo (docker configuration, dashboards)
+- host that will run the monitor should have passwordless ssh to all the ceph
+nodes
+- the storage for the graphite database should be on SSD/flash if possible
+- needs PyYAML, tested with python 2.7.13
+
+### Ceph Cluster Nodes
+- collectd rpm (5.7 or above)
+
+## Installation Sequence
+Install the monitoring endpoint first, and then apply the collectd configuration
+to each of the ceph nodes.  
+
+
+## Setting Up the monitoring endpoint
+On the monitoring host, perform the following steps;  
+1. Pull the required docker images (*listed above*)   
+2. we need to persist the grafana configuration db and settings, as well as the 
+graphite data.  
+```markdown
+mkdir -p /opt/docker/grafana/etc
+mkdir -p /opt/docker/grafana/data/plugins
+mkdir -p /opt/docker/graphite
+```
+3. Download the additional status panel plugin
+```markdown
+cd /opt/docker/grafana/data/plugins
+wget https://grafana.com/api/plugins/vonage-status-panel/versions/1.0.4/download
+unzip download
+rm -f download
+```
+4. Copy the seed .ini file for grafana to the containers etc directory, and reset
+the permissions to be compatible with the containers
+```markdown
+cp etc/grafana/grafana.ini /opt/docker/grafana/etc
+chown -R 104:107 /opt/docker/grafana
+chown -R 997 /opt/docker/graphite
+chmod g+w /opt/docker/graphite
+
+```
+5. Edit the docker-compose.yml example (if necessary)
+6. From the directory with the compose file, issue  
+```
+docker-compose up -d
+```
+7. check that the containers are running and the endpoints are listening  
+7.1 Use ```docker ps```  
+7.2 use ```netstat``` and look for the following ports: 3000,80,2003,2004,7002  
+7.3 open a browser and connect to graphite - it should be running on port 80 of
+the local machine
+8. Add the graphite instance as a datasource to grafana  
+8.1 update setup/add_datasource.json with the IP of the host machine  
+8.2 register the graphite instance to grafana as the default data source  
+```markdown
+curl -u admin:admin -H "Content-Type: application/json" -X POST http://localhost:3000/api/datasources \
+--data-binary @setup/add_datasource.json
+```
+9. Install the grafana labs pie-chart plugin   
+9.1 open a shell session to the grafana instance, and install the plugin  
+```markdown
+docker exec -it grafana bash
+grafana-cli plugins install grafana-piechart-panel
+```
+10. the sample dashboards need to be added/edited to reflect the ceph cluster to
+monitor  
+10.1 seed dashboards are provided in the dashboards/current directory   
+10.2 edit ```dashboard.yml``` with the shortnames of the OSDs and RGWs, plus
+the dns domain name of the environment.  
+10.3 run the following command  
+```markdown
+python dashUpdater.py
+```
+  
+  
+### Updating the dashboards
+After adding ceph nodes to the configuration, update the ```dashboard.yml``` 
+file, and then rerun the ```dashUpdater.py``` script.
+
+
+## Configuration on Each Ceph Node  
+You may need to update your SELINUX policy to allow the write_graphite plugin
+to make outbound connections on port 2003. To test, simply disable SELINUX.  
+1. install collectd (this will also require libcollectdclient)
+2. create the required directories for the cephmetrics collectors (see known
+issues [2])
+```markdown
+mkdir -p /usr/lib64/collectd/python-plugins/collectors
+```
+3. copy the collectors to the directory created in [2], and cephmetrics.py
+to /usr/lib64/collectd/python-plugins
+4. Setup the collectd plugins  
+4.1 Update the write_graphite.conf file to specify the hostname where the 
+grafana/graphite environment is (use a hostname not IP - anecdotally I found that
+with an IP the plugin fails to connect to the graphite container port?)    
+4.2 copy the example plugin files to the /etc/collectd.d directory (i.e. cpu.conf,
+memory.conf etc)  
+5. update the "ClusterName" parameter in the cephmetrics plugin file to match
+ the name of your ceph cluster (default is 'ceph')
+6. copy the example collectd.conf file to the ceph node (or update the existing
+configuration to ensure there is a ```Include "/etc/collectd.d/*.conf"``` entry)
+7. enable collectd
+8. start collectd
+9. check collectd is running without errors
+
+## Known Issues
+1. Following a reboot of an OSD node, the cephmetrics collectd plugin doesn't send disk 
+stats. ***Workaround**: Following the reboot of an OSD, restart the collectd service.*  
+2. the cephmetrics.py and collectors should be installed through python-setuptools to cut down on 
+the installation steps.  
+3. SELINUX may block the write_graphite plugin writing outbound on port 2003
+
+
+
+
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..65c5ca8
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,165 @@
+                   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..b6fd253
--- /dev/null
+++ b/README
@@ -0,0 +1,58 @@
+On the ceph node install collectd
+- tested with collectd-5.7.0-4.el7ost.x86_64
+
+Interval is set to 10 in collectd.conf
+
+write_graphite plugin configured as follows;
+
+LoadPlugin "write_graphite"
+<Plugin write_graphite>
+  <Node "local">
+    Host "192.168.1.52"
+    Port "2003"
+    Protocol "tcp"
+    LogSendErrors true
+    Prefix "collectd."
+    StoreRates true
+    AlwaysAppendDS false
+    EscapeCharacter "_"
+    PreserveSeparator true
+    SeparateInstances true
+  </Node>
+</Plugin>
+
+5.7 introduces the PreserveSeparator parameter, allowing the type instance name to
+contain '.'. At the moment I used .'s in the metric name since the same plugin
+provides all metrics.
+
+Comments welcome!
+
+This is what an entry looks like as seen in influx
+collectd.obj-mon-1.storage.lab.cephmetrics.gauge.ceph.pools._rgw_root.num_bytes_recovered
+        |                     |            \     \    \     \
+        |                     |             \     \    \     \
+        |                     |              |     \    \      \
+prefix  | hostname            |  plugin name | type |ceph|metric| metric name
+                                                     name  group
+
+In the case of pools, the metric name is prefixed by the pool name
+
+Added cephmetrics.conf to /etc/collectd.d dir
+
+mkdir -p /usr/lib64/collectd/python-plugins
+
+to the plugin dir, I copied
+ - cephmetrics.py
+ - collectors dir
+
+
+Known Issues
+1. Singlestat panels don't track the graph panel values 100% of the time
+
+
+
+
+Container Configuration - pulled from docker.io registry
+- grafana - grafana/grafana (official image)
+- graphite - nickstenning/graphite - https://hub.docker.com/r/nickstenning/graphite/
+
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..efe3413
--- /dev/null
+++ b/README.md
@@ -0,0 +1,82 @@
+# cephmetrics
+
+Cephmetrics is a tool that allows a user to visually monitor various metrics in a running Ceph cluster.
+
+## Prerequisites
+- RHEL 7 should be running on all hosts
+- A functional ceph cluster running version ceph-osd-10.2.7-27.el7cp.x86_64 or later must already be up and running.
+- Another host machine independent of the ceph machines must be available.  This host will be used to receive data pushed by the hosts in the Ceph cluster, and will run the dashboard to display that data.
+- A host machine on which to execute `ansible-playbook` to orchestrate the deployment must be available.
+- Passwordless SSH access from the deploy host to the ceph hosts.  The username should be the same for all hosts.
+- Passwordless sudo access on the ceph and dashboard hosts
+- All hosts must share the same DNS domain
+
+## Resulting configuration
+
+After running this procedure, you will have the following configuration.
+- The ceph nodes will have `collectd` installed, along with collector plugins from `cephmetrics-collectd`
+- The dashboard host will have `grafana` installed and configured to display various dashboards by querying data received from Ceph nodes via a `graphite-web`, `python-carbon`, and `python-whisper` stack.
+
+## Installation
+
+### Install cephmetrics-ansible
+
+First, decide which machine you want to use to run `ansible-playbook`.  If you used [`ceph-ansible`](https://github.com/ceph/ceph-ansible) to set up your cluster, you may want to reuse that same host to take advantage of the inventory file that was created as part of that process.
+
+Once the host is selected, perform the following steps there.  This will install a repo which includes the cephmetrics installation code and ansible (version 2.2.3 or later):
+```
+sudo su -
+mkdir ~/cephmetrics
+subscription-manager repos --enable rhel-7-server-optional-rpms --enable rhel-7-server-rhscon-2-installer-rpms
+curl -L -o /etc/yum.repos.d/cephmetrics.repo http://download.ceph.com/cephmetrics/rpm-master/el7/cephmetrics.repo
+yum install cephmetrics-ansible
+```
+
+### Create or edit the inventory file
+
+Next, we need an inventory file.  If you are running `ansible-playbook` on a host that previously ran `ceph-ansible`, you may simply modify `/etc/ansible/hosts`; otherwise you may copy `/usr/share/cephmetrics-ansible/inventory.sample` and modify it if you wish.
+
+The inventory file format looks like:
+
+    [ceph-grafana]
+    grafana_host.example.com
+
+    [osds]
+    osd0.example.com
+    osd1.example.com
+    osd2.example.com
+
+    [mons]
+    mon0.example.com
+    mon1.example.com
+    mon2.example.com
+
+    [mdss]
+    mds0.example.com
+
+    [rgws]
+    rgw0.example.com
+
+If you are running `ansible-playbook` on a host mentioned in the inventory file, you will need to append `ansible_connection=local` to each line in the inventory file that mentions that host.  An example:
+```
+my_host.example.com ansible_connection=local
+```
+Omit the mdss section if no ceph mds nodes are installed.  Omit the rgws section if no rgw nodes are installed.
+
+Ansible variables can be set in a `vars.yml` file if necessary.  If it is required, make sure to add `-e '@/path/to/vars.yml'` to your `ansible-playbook` invocation below.  [Click here](./ansible/README.md) for more information.
+
+## Deploy via ansible-playbook
+
+If you are using a `ceph-ansible` host, run these commands:
+```
+cd /usr/share/cephmetrics-ansible
+ansible-playbook -v playbook.yml
+```
+
+Otherwise, run these commands:
+```
+cd /usr/share/cephmetrics-ansible
+ansible-playbook -v -i /path/to/inventory playbook.yml
+```
+
+Note: The reason it is necessary to change directories is so that `ansible-playbook` will use the bundled `ansible.cfg`; there is currently no command-line argument allowing the specification of an arbitrary `.cfg` file.
diff --git a/TODO b/TODO
new file mode 100644 (file)
index 0000000..cc52402
--- /dev/null
+++ b/TODO
@@ -0,0 +1,25 @@
+
+collectd
+- add network and cpu to all deployments
+- define standard easy roll-out conf (use collectd.d for write_graphite, cephmetrics, cpu and network)
+
+Dashboard
+
+
+collectd : cephmetrics
+- add metrics starting message so you know collection is active
+
+Python Modules
+Mon
+-
+
+RGW
+-
+
+
+
+Completed Items
+05/24 RGW: implement the latencies as different metrics to allow summarisation at the db layer
+05/24 MON: add ceph health status (OK/WARN) to output dict
+05/24 Dashboard: Add ceph health text
+05/29 OSD: OSD metrics added, with dashboard updates
\ No newline at end of file
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
new file mode 100644 (file)
index 0000000..2f4d621
--- /dev/null
@@ -0,0 +1,7 @@
+[defaults]
+roles_path: ./roles/
+host_key_checking: False
+forks: 50
+
+[ssh_connection]
+pipelining=True
diff --git a/ansible/group_vars/all.yml.sample b/ansible/group_vars/all.yml.sample
new file mode 100644 (file)
index 0000000..0e4e36f
--- /dev/null
@@ -0,0 +1,27 @@
+dummy:
+
+## Choose protocol -- http or https
+# For https, you should set grafana.crt/key and dashboard.crt/key
+#
+# If you don't, the scripts will use self-signed certificates and
+# you will have to confirm security exceptions for both dashboard
+# web UI and Grafana.
+#protocol: http
+
+## Set grafana options
+# For new deployments, you can directly set the user/password here.
+# Please change user/pwd in the web UI on an already deployed machine
+# before changing them here.
+#grafana:
+#  admin_user: admin
+#  admin_password: admin
+#  crt: ''
+#  key: ''
+
+## Set dashboard options
+#dashboard:
+#  admin_user: admin
+#  admin_password: admin
+#  crt: ''
+#  key: ''
+#  port: 8234
diff --git a/ansible/inventory.sample b/ansible/inventory.sample
new file mode 100644 (file)
index 0000000..99fa70e
--- /dev/null
@@ -0,0 +1,18 @@
+[ceph-dashboard]
+grafana_host.example.com
+
+[osds]
+osd0.example.com
+osd1.example.com
+osd2.example.com
+
+[mons]
+mon0.example.com
+mon1.example.com
+mon2.example.com
+
+[mdss]
+mds0.example.com
+
+[rgws]
+rgw0.example.com
diff --git a/ansible/playbook.yml b/ansible/playbook.yml
new file mode 100644 (file)
index 0000000..add9eaf
--- /dev/null
@@ -0,0 +1,51 @@
+---
+- hosts: all
+  gather_facts: true
+  any_errors_fatal: true
+
+- hosts:
+  - mgrs
+  become: true
+  tasks:
+    - name: Restart ceph-mgr services to make sure they are properly loaded
+      shell: |
+        systemctl restart ceph-mgr*.service
+
+- hosts:
+  - mgrs[0]
+  become: true
+  roles:
+  - ceph-mgr
+
+- hosts:
+  - ceph-grafana
+  # These are roles used by ceph-ansible
+  - mons
+  - agents
+  - osds
+  - mdss
+  - rgws
+  - nfss
+  - restapis
+  - rbdmirrors
+  - clients
+  - mgrs
+  - iscsis
+  # This role is (so far) only used for testing
+  - cluster
+  become: true
+  roles:
+  - ceph-node-exporter
+
+- hosts:
+  - ceph-grafana
+  become: true
+  roles:
+  - ceph-prometheus
+  - ceph-grafana
+
+- hosts:
+  - mgrs[0]
+  become: true
+  roles:
+  - ceph-dashboard
diff --git a/ansible/purge.yml b/ansible/purge.yml
new file mode 100644 (file)
index 0000000..f08b2e1
--- /dev/null
@@ -0,0 +1,121 @@
+---
+- name: purge grafana host
+  hosts:
+    - ceph-grafana
+  become: true
+  tasks:
+  - name: Stop and disable services
+    service:
+      name: "{{ item }}"
+      enabled: no
+      state: stopped
+    with_items:
+      - grafana-server
+      - carbon-cache
+      - httpd
+      - node_exporter
+      - prometheus-node-exporter
+    failed_when: false
+
+  - name: Remove packages
+    package:
+      name: "{{ item }}"
+      state: absent
+    with_items:
+      - graphite-web
+      - python-carbon
+      - grafana
+      - cephmetrics
+      - prometheus
+      - ceph-grafana-dashboards
+
+  - name: Remove files
+    file:
+      dest: "{{ item }}"
+      state: absent
+    with_items:
+      - /var/lib/graphite
+      - /var/lib/graphite-web
+      - /var/lib/grafana
+      - /var/lib/carbon
+      - /etc/grafana/grafana.ini
+      - /etc/grafana/ceph-dashboard.crt
+      - /etc/grafana/ceph-dashboard.key
+      - /etc/grafana/provisioning/dashboards/ceph-dashboard.yml
+      - /etc/grafana/provisioning/datasources/ceph-dashboard.yml
+      - /etc/grafana/dashboards/ceph-dashboard
+      - /etc/carbon/storage-schemas.conf
+      - /etc/httpd/conf.d/graphite-web.conf
+      - /etc/yum.repos.d/cephmetrics.repo
+      - /etc/yum.repos.d/cephmetrics-custom.repo
+      - /etc/yum.repos.d/grafana.repo
+      - /tmp/dashboard.yml
+      - /tmp/dashUpdater.py
+      - /tmp/dashboards
+
+  - name: Remove containers
+    docker_container:
+      name: "{{ item }}"
+      state: absent
+    with_items:
+      - grafana-server
+      - prometheus
+
+- name: purge all the hosts
+  hosts:
+    # These are roles used by ceph-ansible
+    - mons
+    - agents
+    - osds
+    - mdss
+    - rgws
+    - nfss
+    - restapis
+    - rbdmirrors
+    - clients
+    - mgrs
+    # This role is (so far) only used for testing
+    - cluster
+  become: true
+  tasks:
+  - name: Stop and disable services
+    service:
+      name: "{{ item }}"
+      enabled: no
+      state: stopped
+    with_items:
+      - collectd
+      - node_exporter
+      - prometheus-node-exporter
+    failed_when: false
+
+  - name: Remove packages
+    package:
+      name: "{{ item }}"
+      state: absent
+    with_items:
+      - cephmetrics-collectors
+      - collectd
+      - prometheus-node_exporter
+
+  - name: Remove files
+    file:
+      dest: "{{ item }}"
+      state: absent
+    with_items:
+      - /etc/collectd.d/cephmetrics.conf
+      - /etc/collectd.d/cpu.conf
+      - /etc/collectd.d/memory.conf
+      - /etc/collectd.d/nics.conf
+      - /etc/collectd.d/write_graphite.conf
+      - /etc/collectd.conf
+      - /etc/yum.repos.d/cephmetrics.repo
+      - /usr/lib64/collectd
+      - /usr/local/sbin/node_exporter
+
+  - name: Remove containers
+    docker_container:
+      name: "{{ item }}"
+      state: absent
+    with_items:
+      - node-exporter
diff --git a/ansible/roles/ceph-dashboard/defaults/main.yml b/ansible/roles/ceph-dashboard/defaults/main.yml
new file mode 100644 (file)
index 0000000..de8591e
--- /dev/null
@@ -0,0 +1,26 @@
+---
+defaults:
+  dashboard:
+    admin_user: admin
+    admin_password: admin
+    rgw_api_user_id: ceph-dashboard
+    rgw_api_host: ''
+    rgw_api_port: ''
+    rgw_api_scheme: ''
+    rgw_api_admin_resource: ''
+    rgw_api_no_ssl_verify: ''
+devel_packages:
+  yum:
+    # unzip is needed to extract the Vonage plugin
+    - unzip
+    - grafana
+    # for dashUpdater.py
+    - PyYAML
+    - python-requests
+  apt:
+    # unzip is needed to extract the Vonage plugin
+    - unzip
+    - grafana
+    # for dashUpdater.py
+    - python-yaml
+    - python-requests
diff --git a/ansible/roles/ceph-dashboard/meta/main.yml b/ansible/roles/ceph-dashboard/meta/main.yml
new file mode 100644 (file)
index 0000000..e97ea33
--- /dev/null
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: ceph-defaults
diff --git a/ansible/roles/ceph-dashboard/tasks/configure_dashboard.yml b/ansible/roles/ceph-dashboard/tasks/configure_dashboard.yml
new file mode 100644 (file)
index 0000000..95246cc
--- /dev/null
@@ -0,0 +1,137 @@
+---
+- name: Check to see if the mgr is containerized
+  command: "docker inspect {{ item }}"
+  with_items:
+    - "ceph-mgr@{{ ansible_hostname }}"
+    - "ceph-mgr-{{ ansible_hostname }}"
+  register: mgr_container
+  failed_when: false
+
+- name: Choose the correct container name  # keeps the candidate name(s) whose docker inspect returned 0; empty when none did
+  set_fact:
+    container_name: "{% for res in mgr_container.results if res.rc == 0 %}{{ res.item }}{% endfor %}"
+    mgr_prefix: "" # Set the default value for mgr_prefix
+
+- name: Prefix the mgr command with a docker command
+  set_fact:
+    mgr_prefix: "docker exec {{ container_name }}"
+  when: container_name != ""
+
+- name: Disable SSL for dashboard
+  shell: |
+    {{ mgr_prefix }} ceph config set mgr mgr/dashboard/ssl false || \
+    {{ mgr_prefix }} ceph config-key set mgr/dashboard/ssl false
+  when: protocol != "https"
+
+- name: Enable SSL for dashboard
+  shell: |
+    {{ mgr_prefix }} ceph config set mgr mgr/dashboard/ssl true || \
+    {{ mgr_prefix }} ceph config-key set mgr/dashboard/ssl true
+  when: protocol == "https"
+
+- name: Copy dashboard SSL certificate file
+  copy:
+    src: "{{ dashboard.crt }}"
+    dest: "/etc/ceph/ceph-dashboard.crt"
+    owner: root
+    group: root
+    mode: 0644
+  when:
+    - dashboard.crt
+    - protocol == "https"
+
+- name: Copy dashboard SSL certificate key
+  copy:
+    src: "{{ dashboard.key }}"
+    dest: "/etc/ceph/ceph-dashboard.key"
+    owner: root
+    group: root
+    mode: 0600  # private key, so readable by its owner (root) only
+  when:
+    - dashboard.key
+    - protocol == "https"
+
+- name: Generate a Self Signed OpenSSL certificate for dashboard
+  shell: |
+    test -f /etc/ceph/ceph-dashboard.key -a -f /etc/ceph/ceph-dashboard.crt || \
+    openssl req -new -nodes -x509 -subj '/O=IT/CN=ceph-dashboard' -days 3650 -keyout /etc/ceph/ceph-dashboard.key -out /etc/ceph/ceph-dashboard.crt -extensions v3_ca
+  when:
+    - protocol == "https"
+    - not dashboard.key or not dashboard.crt
+
+- name: Import dashboard certificate file
+  command: "{{ mgr_prefix }} ceph config-key set mgr/dashboard/crt -i /etc/ceph/ceph-dashboard.crt"
+  when: protocol == "https"
+
+- name: Import dashboard certificate key
+  command: "{{ mgr_prefix }} ceph config-key set mgr/dashboard/key -i /etc/ceph/ceph-dashboard.key"
+  when: protocol == "https"
+
+- name: "Set the dashboard port ({{ dashboard.port }})"
+  shell: |
+    {{ mgr_prefix }} ceph config set mgr mgr/dashboard/server_port {{ dashboard.port }} || \
+    {{ mgr_prefix }} ceph config-key set mgr/dashboard/server_port {{ dashboard.port }}
+
+- name: Disable mgr dashboard module (restart)
+  command: "{{ mgr_prefix }} ceph mgr module disable dashboard"
+
+- name: Enable mgr dashboard module (restart)
+  command: "{{ mgr_prefix }} ceph mgr module enable dashboard"
+
+- name: Wait a while for mgr module to come up
+  command: "sleep 3"
+
+- name: Set or update dashboard admin username and password
+  shell: |
+    if {{ mgr_prefix }} ceph dashboard ac-user-show {{ dashboard.admin_user }}; then
+      {{ mgr_prefix }} ceph dashboard ac-user-set-password {{ dashboard.admin_user }} {{ dashboard.admin_password }}
+    else
+      {{ mgr_prefix }} ceph dashboard ac-user-create {{ dashboard.admin_user }} {{ dashboard.admin_password }} administrator
+    fi
+
+- name: Set grafana url
+  command: "{{ mgr_prefix }} ceph dashboard set-grafana-api-url {{ protocol }}://{{ groups['ceph-grafana'][0] }}:3000/"
+
+- name: Create radosgw system user
+  command: "{{ mgr_prefix }} radosgw-admin user create --uid={{ dashboard.rgw_api_user_id }} --display-name='Ceph dashboard' --system"
+  register: rgw_user_output
+
+- name: Get the rgw access and secret keys
+  set_fact:
+    rgw_access_key: "{{ (rgw_user_output.stdout | from_json)['keys'][0]['access_key'] }}"
+    rgw_secret_key: "{{ (rgw_user_output.stdout | from_json)['keys'][0]['secret_key'] }}"
+
+- name: Set the rgw user
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-user-id {{ dashboard.rgw_api_user_id }}"
+
+- name: Set the rgw access key
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-access-key {{ rgw_access_key }}"
+
+- name: Set the rgw secret key
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-secret-key {{ rgw_secret_key }}"
+
+- name: Set the rgw host
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-host {{ dashboard.rgw_api_host }}"
+  when: dashboard.rgw_api_host
+
+- name: Set the rgw port
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-port {{ dashboard.rgw_api_port }}"
+  when: dashboard.rgw_api_port
+
+- name: Set the rgw scheme
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-scheme {{ dashboard.rgw_api_scheme }}"
+  when: dashboard.rgw_api_scheme
+
+- name: Set the rgw admin resource
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-admin-resource {{ dashboard.rgw_api_admin_resource }}"
+  when: dashboard.rgw_api_admin_resource
+
+- name: Disable ssl verification for rgw
+  command: "{{ mgr_prefix }} ceph dashboard set-rgw-api-ssl-verify False"
+  when: dashboard.rgw_api_no_ssl_verify
+
+- name: Disable mgr dashboard module (restart)
+  command: "{{ mgr_prefix }} ceph mgr module disable dashboard"
+
+- name: Enable mgr dashboard module (restart)
+  command: "{{ mgr_prefix }} ceph mgr module enable dashboard"
diff --git a/ansible/roles/ceph-dashboard/tasks/main.yml b/ansible/roles/ceph-dashboard/tasks/main.yml
new file mode 100644 (file)
index 0000000..81b162d
--- /dev/null
@@ -0,0 +1,8 @@
+---
+- include: merge_vars.yml
+
+- include: configure_dashboard.yml
+
+- name: Print dashboard URL
+  debug:
+    msg: "The dashboard has been deployed! You can access your dashboard web UI at {{ protocol }}://{{ ansible_hostname }}:{{ dashboard.port }}/ as an '{{ dashboard.admin_user }}' user with '{{ dashboard.admin_password }}' password."
diff --git a/ansible/roles/ceph-dashboard/tasks/merge_vars.yml b/ansible/roles/ceph-dashboard/tasks/merge_vars.yml
new file mode 120000 (symlink)
index 0000000..299adff
--- /dev/null
@@ -0,0 +1 @@
+../../ceph-defaults/tasks/merge_vars.yml
\ No newline at end of file
diff --git a/ansible/roles/ceph-defaults/defaults/main.yml b/ansible/roles/ceph-defaults/defaults/main.yml
new file mode 100644 (file)
index 0000000..ce928c0
--- /dev/null
@@ -0,0 +1,23 @@
+---
+defaults:
+  containerized: true
+  devel_mode: true
+  # The firewalld zone that carbon and grafana will use
+  firewalld_zone: public
+  # Choose http or https
+  # For https, you should set grafana.crt/key and dashboard.crt/key
+  protocol: http
+  # We need this for both grafana and the mgr-dashboard module
+  grafana:
+    # You need to change these in the web UI on an already deployed machine, first
+    # New deployments work fine
+    admin_user: admin
+    admin_password: admin
+    # We only need this for SSL (https) connections
+    crt: ''
+    key: ''
+  dashboard:
+    # We only need this for SSL (https) connections
+    crt: ''
+    key: ''
+    port: 8234
diff --git a/ansible/roles/ceph-defaults/tasks/main.yml b/ansible/roles/ceph-defaults/tasks/main.yml
new file mode 100644 (file)
index 0000000..d91c420
--- /dev/null
@@ -0,0 +1,6 @@
+---
+- include: merge_vars.yml
+
+- include: setup_repos.yml
+  when:
+    - devel_mode
diff --git a/ansible/roles/ceph-defaults/tasks/merge_vars.yml b/ansible/roles/ceph-defaults/tasks/merge_vars.yml
new file mode 100644 (file)
index 0000000..f8dbcd0
--- /dev/null
@@ -0,0 +1,5 @@
+---
+- name: Combine default settings and user-defined variables  # per key: unset -> default; mapping -> default merged with the user's mapping; otherwise the user's value
+  set_fact: {"{{ item }}": "{% if vars[item] is not defined %}{{ defaults[item] }}{% elif vars[item] is mapping %}{{ defaults[item]|combine(vars[item]|default({})) }}{% else %}{{ vars[item] }}{% endif %}"}
+  with_items: "{{ defaults.keys() }}"  # one set_fact per top-level key of the role's defaults
+  no_log: true
diff --git a/ansible/roles/ceph-defaults/tasks/setup_repos.yml b/ansible/roles/ceph-defaults/tasks/setup_repos.yml
new file mode 100644 (file)
index 0000000..e351f6d
--- /dev/null
@@ -0,0 +1,13 @@
+---
+- name: Enable EPEL
+  yum:
+    name: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"
+    state: present
+  when:
+    - ansible_pkg_mgr == "yum"
+
+- name: Update apt cache
+  apt:
+    update_cache: true
+  when:
+    - ansible_pkg_mgr == 'apt'
diff --git a/ansible/roles/ceph-docker/defaults/main.yml b/ansible/roles/ceph-docker/defaults/main.yml
new file mode 100644 (file)
index 0000000..e31960c
--- /dev/null
@@ -0,0 +1,12 @@
+---
+defaults:
+  docker:
+    packages:
+      yum:
+        - docker
+        - docker-python
+      apt:
+        - docker.io
+        - python-docker
+    network_name: cephmetrics
+    service_name: docker
diff --git a/ansible/roles/ceph-docker/meta/main.yml b/ansible/roles/ceph-docker/meta/main.yml
new file mode 100644 (file)
index 0000000..e97ea33
--- /dev/null
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: ceph-defaults
diff --git a/ansible/roles/ceph-docker/tasks/install_packages.yml b/ansible/roles/ceph-docker/tasks/install_packages.yml
new file mode 100644 (file)
index 0000000..1c423fa
--- /dev/null
@@ -0,0 +1,6 @@
+---
+- name: Install packages
+  package:
+    name: "{{ item }}"
+    state: latest
+  with_items: "{{ docker.packages[ansible_pkg_mgr] }}"
diff --git a/ansible/roles/ceph-docker/tasks/main.yml b/ansible/roles/ceph-docker/tasks/main.yml
new file mode 100644 (file)
index 0000000..2522c14
--- /dev/null
@@ -0,0 +1,16 @@
+---
+- include: merge_vars.yml
+
+- name: End play if no containers are desired
+  meta: end_play
+  when:
+    - not containerized
+
+- include: install_packages.yml
+
+- include: start_services.yml
+
+- name: Create a network
+  docker_network:
+    name: "{{ docker.network_name }}"
+    driver: bridge
diff --git a/ansible/roles/ceph-docker/tasks/merge_vars.yml b/ansible/roles/ceph-docker/tasks/merge_vars.yml
new file mode 120000 (symlink)
index 0000000..299adff
--- /dev/null
@@ -0,0 +1 @@
+../../ceph-defaults/tasks/merge_vars.yml
\ No newline at end of file
diff --git a/ansible/roles/ceph-docker/tasks/start_services.yml b/ansible/roles/ceph-docker/tasks/start_services.yml
new file mode 100644 (file)
index 0000000..320b816
--- /dev/null
@@ -0,0 +1,10 @@
+---
+- name: Reload systemd
+  systemd:
+    daemon_reload: yes
+
+- name: Enable and start services
+  service:
+    name: "{{ docker.service_name }}"
+    state: restarted
+    enabled: true
diff --git a/ansible/roles/ceph-grafana/defaults/main.yml b/ansible/roles/ceph-grafana/defaults/main.yml
new file mode 100644 (file)
index 0000000..829ff8e
--- /dev/null
@@ -0,0 +1,29 @@
+---
+defaults:
+  grafana:
+    container_image: "grafana/grafana:latest"
+    container_cpu_period: 100000
+    container_cpu_cores: 2
+    # container_memory is in GB
+    container_memory: 4
+    uid: 472
+    datasource: Dashboard
+    dashboards_path: "/etc/grafana/dashboards/ceph-dashboard"
+    plugins:
+      - vonage-status-panel
+      - grafana-piechart-panel
+devel_packages:
+  yum:
+    # unzip is needed to extract the Vonage plugin
+    - unzip
+    - grafana
+    # for dashUpdater.py
+    - PyYAML
+    - python-requests
+  apt:
+    # unzip is needed to extract the Vonage plugin
+    - unzip
+    - grafana
+    # for dashUpdater.py
+    - python-yaml
+    - python-requests
diff --git a/ansible/roles/ceph-grafana/files/dashboards b/ansible/roles/ceph-grafana/files/dashboards
new file mode 120000 (symlink)
index 0000000..9791cdc
--- /dev/null
@@ -0,0 +1 @@
+../../../../dashboards
\ No newline at end of file
diff --git a/ansible/roles/ceph-grafana/files/grafana-server.service b/ansible/roles/ceph-grafana/files/grafana-server.service
new file mode 100644 (file)
index 0000000..fab8f51
--- /dev/null
@@ -0,0 +1,17 @@
+# This file is managed by ansible, don't make changes here - they will be
+# overwritten.
+[Unit]
+Description=grafana-server
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach grafana-server
+ExecStop=-/usr/bin/docker stop grafana-server
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/ceph-grafana/files/grafana.list b/ansible/roles/ceph-grafana/files/grafana.list
new file mode 100644 (file)
index 0000000..886da8d
--- /dev/null
@@ -0,0 +1 @@
+deb https://packagecloud.io/grafana/stable/debian/ jessie main
diff --git a/ansible/roles/ceph-grafana/handlers/main.yml b/ansible/roles/ceph-grafana/handlers/main.yml
new file mode 100644 (file)
index 0000000..cb5200f
--- /dev/null
@@ -0,0 +1,8 @@
+---
+- name: Enable service
+  # We use the systemd module here so we can use the daemon_reload feature,
+  # since we're shipping the .service file ourselves
+  systemd:
+    name: grafana-server
+    daemon_reload: true
+    enabled: true
diff --git a/ansible/roles/ceph-grafana/meta/main.yml b/ansible/roles/ceph-grafana/meta/main.yml
new file mode 100644 (file)
index 0000000..e97ea33
--- /dev/null
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: ceph-defaults
diff --git a/ansible/roles/ceph-grafana/tasks/configure_firewall.yml b/ansible/roles/ceph-grafana/tasks/configure_firewall.yml
new file mode 100644 (file)
index 0000000..f01b76c
--- /dev/null
@@ -0,0 +1,17 @@
+---
+- name: Check firewalld status
+  shell: "systemctl show firewalld | grep UnitFileState"
+  register: firewalld_status
+  failed_when: false
+  changed_when: false
+
+- name: Open ports for Grafana
+  firewalld:
+    port: "{{ item }}"
+    zone: "{{ firewalld_zone }}"
+    state: enabled
+    immediate: true
+    permanent: true
+  with_items:
+    - 3000/tcp
+  when: "'enabled' in firewalld_status.stdout"
diff --git a/ansible/roles/ceph-grafana/tasks/configure_grafana.yml b/ansible/roles/ceph-grafana/tasks/configure_grafana.yml
new file mode 100644 (file)
index 0000000..2aab67a
--- /dev/null
@@ -0,0 +1,96 @@
+---
+- name: Make sure grafana is down
+  service:
+    name: grafana-server
+    state: stopped
+
+- name: Wait for grafana to be stopped
+  wait_for:
+    port: 3000
+    state: stopped
+
+- name: Make sure grafana configuration directories exist
+  file:
+    path: "{{ item }}"
+    state: directory
+    recurse: yes
+  with_items:
+    - "/etc/grafana/provisioning/datasources"
+    - "/etc/grafana/provisioning/dashboards"
+
+- name: Write grafana.ini
+  template:
+    src: grafana.ini
+    dest: /etc/grafana/grafana.ini
+    mode: 0640
+
+- name: Write datasources provisioning config file
+  template:
+    src: datasources-ceph-dashboard.yml
+    dest: /etc/grafana/provisioning/datasources/ceph-dashboard.yml
+    mode: 0640
+
+- name: Write dashboards provisioning config file
+  template:
+    src: dashboards-ceph-dashboard.yml
+    dest: /etc/grafana/provisioning/dashboards/ceph-dashboard.yml
+    mode: 0640
+
+- name: Copy grafana SSL certificate file
+  copy:
+    src: "{{ grafana.crt }}"
+    dest: "/etc/grafana/ceph-dashboard.crt"
+    mode: 0640
+  when:
+    - grafana.crt
+    - protocol == "https"
+
+- name: Copy grafana SSL certificate key
+  copy:
+    src: "{{ grafana.key }}"
+    dest: "/etc/grafana/ceph-dashboard.key"
+    mode: 0640
+  when:
+    - grafana.key
+    - protocol == "https"
+
+- name: Generate a Self Signed OpenSSL certificate for dashboard
+  shell: |
+    test -f /etc/grafana/ceph-dashboard.key -a -f /etc/grafana/ceph-dashboard.crt || \
+    openssl req -new -nodes -x509 -subj '/O=IT/CN=ceph-grafana' -days 3650 -keyout /etc/grafana/ceph-dashboard.key -out /etc/grafana/ceph-dashboard.crt -extensions v3_ca
+  when:
+    - protocol == "https"
+    - not grafana.key or not grafana.crt
+
+- name: Set owner/group on /etc/grafana
+  file:
+    path: /etc/grafana
+    state: directory
+    # This is the UID used by the grafana container
+    owner: "{{ grafana.uid }}"
+    # This group is used by the grafana rpm
+    group: "grafana"
+    recurse: true
+
+- include: grafana_plugins.yml
+  when:
+    - devel_mode
+    - not containerized
+
+# TODO!!! Figure out how to distribute these !!!TODO
+- name: Store the dashboard files
+  copy:
+    src: "files/dashboards/"
+    dest: "{{ grafana.dashboards_path }}"
+    force: "{{ devel_mode }}"
+    follow: true
+
+- name: Enable and start grafana
+  service:
+    name: grafana-server
+    state: restarted
+    enabled: true
+
+- name: Wait for grafana to start
+  wait_for:
+    port: 3000
diff --git a/ansible/roles/ceph-grafana/tasks/grafana_plugins.yml b/ansible/roles/ceph-grafana/tasks/grafana_plugins.yml
new file mode 100644 (file)
index 0000000..e8029c7
--- /dev/null
@@ -0,0 +1,24 @@
+---
+- name: Create Grafana plugins directory
+  file:
+    name: /var/lib/grafana/plugins
+    state: directory
+
+- name: Install Grafana plugins
+  command: "grafana-cli plugins install {{ item }}"
+  with_items: "{{ grafana.plugins }}"
+
+- name: Update Grafana plugins
+  command: "grafana-cli plugins update {{ item }}"
+  with_items: "{{ grafana.plugins }}"
+
+- name: Update status-panel for readability within the 'light' theme (older versions)
+  command: "sed -i.bak -e 's/green/rgb(1,167,1)/g' /var/lib/grafana/plugins/vonage-status-panel/dist/css/status_panel.css"
+
+- name: Update status-panel for readability within the 'light' theme (newer versions)
+  replace:
+    path: "/var/lib/grafana/plugins/vonage-status-panel/dist/status_ctrl.js"
+    regexp:  "ok: 'rgba\\(50, 128, 45, 0\\.9\\)',"
+    replace: "ok: 'rgb(1,167,1)',"
+    backup: no
+  failed_when: false
diff --git a/ansible/roles/ceph-grafana/tasks/install_packages.yml b/ansible/roles/ceph-grafana/tasks/install_packages.yml
new file mode 100644 (file)
index 0000000..eeea168
--- /dev/null
@@ -0,0 +1,46 @@
+---
+- name: Add Grafana repo
+  template:
+    src: grafana.repo
+    dest: /etc/yum.repos.d/grafana.repo
+  when:
+    - ansible_pkg_mgr == 'yum'
+    - devel_mode
+
+- name: Add grafana repo
+  copy:
+    src: files/grafana.list
+    dest: /etc/apt/sources.list.d/grafana.list
+  when:
+    - ansible_pkg_mgr == "apt"
+    - devel_mode
+
+- name: Add packagecloud GPG key to apt
+  apt_key:
+    # This is the key used by the grafana repo
+    url: https://packagecloud.io/gpg.key
+    id: D59097AB
+    state: present
+  when:
+    - ansible_pkg_mgr == "apt"
+    - devel_mode
+
+- name: Update apt cache
+  apt:
+    update_cache: true
+  when:
+    - ansible_pkg_mgr == 'apt'
+    - devel_mode
+
+- name: Install packages
+  package:
+    name: "{{ item }}"
+    state: latest
+  with_items: "{{ devel_packages[ansible_pkg_mgr] }}"
+  when: devel_mode
+
+- name: Install cephmetrics
+  package:
+    name: cephmetrics
+    state: latest
+  when: not devel_mode
diff --git a/ansible/roles/ceph-grafana/tasks/main.yml b/ansible/roles/ceph-grafana/tasks/main.yml
new file mode 100644 (file)
index 0000000..806d019
--- /dev/null
@@ -0,0 +1,14 @@
+---
+- include: merge_vars.yml
+
+- include: setup_container.yml
+  when:
+    - containerized
+
+- include: install_packages.yml
+  when:
+    - not containerized
+
+- include: configure_firewall.yml
+
+- include: configure_grafana.yml
diff --git a/ansible/roles/ceph-grafana/tasks/merge_vars.yml b/ansible/roles/ceph-grafana/tasks/merge_vars.yml
new file mode 120000 (symlink)
index 0000000..299adff
--- /dev/null
@@ -0,0 +1 @@
+../../ceph-defaults/tasks/merge_vars.yml
\ No newline at end of file
diff --git a/ansible/roles/ceph-grafana/tasks/setup_container.yml b/ansible/roles/ceph-grafana/tasks/setup_container.yml
new file mode 100644 (file)
index 0000000..e217610
--- /dev/null
@@ -0,0 +1,58 @@
+---
+- name: Include ceph-docker
+  include_role:
+    name: ceph-docker
+    allow_duplicates: false
+  when: containerized
+
+- name: Create grafana user
+  user:
+    name: grafana
+    shell: '/bin/false'
+    createhome: false
+    system: true
+
+- name: Create /etc/grafana and /var/lib/grafana
+  file:
+    path: "{{ item }}"
+    state: directory
+    owner: "{{ grafana.uid }}"
+    recurse: true
+  with_items:
+    - /etc/grafana
+    - /var/lib/grafana
+
+- name: Create docker container
+  docker_container:
+    name: grafana-server
+    image: "{{ grafana.container_image }}"
+    state: present
+    # restart to allow updates
+    restart: true
+    restart_policy: no
+    published_ports: '3000:3000'
+    detach: true
+    volumes:
+      - "/etc/grafana:/etc/grafana:Z"
+      - "/var/lib/grafana:/var/lib/grafana:Z"
+    networks:
+      - name: "{{ docker.network_name }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ grafana.container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ grafana.container_cpu_period * grafana.container_cpu_cores }}"
+    memory: "{{ grafana.container_memory }}GB"
+    memory_swap: "{{ grafana.container_memory * 2 }}GB"
+    env:
+      GF_INSTALL_PLUGINS: "{{ grafana.plugins|join(',') }}"
+
+- name: Ship systemd service
+  copy:
+    src: grafana-server.service
+    dest: "/etc/systemd/system/"
+    owner: root
+    group: root
+    mode: 0644
+  notify: Enable service
diff --git a/ansible/roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml b/ansible/roles/ceph-grafana/templates/dashboards-ceph-dashboard.yml
new file mode 100644 (file)
index 0000000..990a60f
--- /dev/null
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+- name: 'Ceph Dashboard'
+  orgId: 1
+  folder: 'ceph-dashboard'
+  type: file
+  disableDeletion: false
+  updateIntervalSeconds: 3
+  editable: false
+  options:
+    path: '{{ grafana.dashboards_path }}'
diff --git a/ansible/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml b/ansible/roles/ceph-grafana/templates/datasources-ceph-dashboard.yml
new file mode 100644 (file)
index 0000000..8bb83f1
--- /dev/null
@@ -0,0 +1,26 @@
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+  - name: '{{ grafana.datasource }}'
+    orgId: 1
+
+# list of datasources to insert/update depending
+# what's available in the database
+datasources:
+  # <string, required> name of the datasource. Required
+- name: '{{ grafana.datasource }}'
+  # <string, required> datasource type. Required
+  type: 'prometheus'
+  # <string, required> access mode. proxy or direct (Server or Browser in the UI). Required
+  access: 'proxy'
+  # <int> org id. will default to orgId 1 if not specified
+  orgId: 1
+  # <string> url
+  url: 'http://prometheus:9090'
+  # <bool> enable/disable basic auth
+  basicAuth: false
+  # <bool> mark as default datasource. Max one per org
+  isDefault: true
+  # <bool> allow users to edit datasources from the UI.
+  editable: false
diff --git a/ansible/roles/ceph-grafana/templates/grafana.ini b/ansible/roles/ceph-grafana/templates/grafana.ini
new file mode 100644 (file)
index 0000000..9b6a51f
--- /dev/null
@@ -0,0 +1,26 @@
+# [server]
+# root_url = %(protocol)s://%(domain)s:%(http_port)s/api/grafana/proxy
+
+[users]
+default_theme = light
+
+#################################### Anonymous Auth ##########################
+[auth.anonymous]
+# enable anonymous access
+enabled = true
+
+# specify organization name that should be used for unauthenticated users
+org_name = Main Org.
+
+# specify role for unauthenticated users
+org_role = Viewer
+
+[server]
+cert_file = /etc/grafana/ceph-dashboard.crt
+cert_key = /etc/grafana/ceph-dashboard.key
+domain = {{ ansible_fqdn }}
+protocol = {{ protocol }}
+
+[security]
+admin_user = {{ grafana.admin_user }}
+admin_password = {{ grafana.admin_password }}
diff --git a/ansible/roles/ceph-grafana/templates/grafana.repo b/ansible/roles/ceph-grafana/templates/grafana.repo
new file mode 100644 (file)
index 0000000..1ba7fb6
--- /dev/null
@@ -0,0 +1,9 @@
+[grafana]
+name=grafana
+baseurl=https://packagecloud.io/grafana/stable/el/{{ ansible_distribution_major_version }}/$basearch
+repo_gpgcheck=1
+enabled=1
+gpgcheck=1
+gpgkey=https://packagecloud.io/gpg.key https://grafanarel.s3.amazonaws.com/RPM-GPG-KEY-grafana
+sslverify=1
+sslcacert=/etc/pki/tls/certs/ca-bundle.crt
diff --git a/ansible/roles/ceph-mgr/defaults/main.yml b/ansible/roles/ceph-mgr/defaults/main.yml
new file mode 100644 (file)
index 0000000..dfdd45e
--- /dev/null
@@ -0,0 +1,2 @@
+---
+defaults: {}
diff --git a/ansible/roles/ceph-mgr/meta/main.yml b/ansible/roles/ceph-mgr/meta/main.yml
new file mode 100644 (file)
index 0000000..e97ea33
--- /dev/null
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: ceph-defaults
diff --git a/ansible/roles/ceph-mgr/tasks/main.yml b/ansible/roles/ceph-mgr/tasks/main.yml
new file mode 100644 (file)
index 0000000..e555bd3
--- /dev/null
@@ -0,0 +1,26 @@
+---
+- include: merge_vars.yml
+
+- name: Check to see if the mgr is containerized
+  command: "docker inspect {{ item }}"
+  with_items:
+    - "ceph-mgr@{{ ansible_hostname }}"
+    - "ceph-mgr-{{ ansible_hostname }}"
+  register: mgr_container
+  failed_when: false
+
+- name: Choose the correct container name
+  set_fact:
+    container_name: "{% for res in mgr_container.results if res.rc == 0 %}{{ res.item }}{% endfor %}"
+    mgr_prefix: "" # Set the default value for mgr_prefix
+
+- name: Prefix the mgr command with a docker command
+  set_fact:
+    mgr_prefix: "docker exec {{ container_name }}"
+  when: container_name != ""
+
+- name: Enable mgr prometheus module
+  command: "{{ mgr_prefix }} ceph mgr module enable prometheus"
+
+- name: Enable mgr dashboard module
+  command: "{{ mgr_prefix }} ceph mgr module enable dashboard"
diff --git a/ansible/roles/ceph-mgr/tasks/merge_vars.yml b/ansible/roles/ceph-mgr/tasks/merge_vars.yml
new file mode 120000 (symlink)
index 0000000..299adff
--- /dev/null
@@ -0,0 +1 @@
+../../ceph-defaults/tasks/merge_vars.yml
\ No newline at end of file
diff --git a/ansible/roles/ceph-node-exporter/defaults/main.yml b/ansible/roles/ceph-node-exporter/defaults/main.yml
new file mode 100644 (file)
index 0000000..59b8b77
--- /dev/null
@@ -0,0 +1,10 @@
+---
+defaults:
+  node_exporter:
+    arch_map:
+      x86_64: amd64
+      i386: '386'
+    packages:
+      - prometheus-node-exporter
+    service_name: node_exporter
+    container_image: prom/node-exporter:latest
diff --git a/ansible/roles/ceph-node-exporter/files/node_exporter.service b/ansible/roles/ceph-node-exporter/files/node_exporter.service
new file mode 100644 (file)
index 0000000..ebf57b1
--- /dev/null
@@ -0,0 +1,20 @@
+# This file is managed by ansible, don't make changes here - they will be
+# overwritten.
+[Unit]
+Description=Node Exporter
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach node-exporter
+# Try to load cfg80211 before running the container (the node exporter needs it
+# to test for wi-fi devices); the '-' prefix keeps a failed modprobe non-fatal
+ExecStartPre=-/usr/sbin/modprobe cfg80211
+ExecStop=-/usr/bin/docker stop node-exporter
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/ceph-node-exporter/handlers/main.yml b/ansible/roles/ceph-node-exporter/handlers/main.yml
new file mode 100644 (file)
index 0000000..c983655
--- /dev/null
@@ -0,0 +1,9 @@
+---
+- name: Restart service
+  # We use the systemd module here so we can use the daemon_reload feature,
+  # since we're shipping the .service file ourselves
+  systemd:
+    name: "{{ node_exporter.service_name }}"
+    daemon_reload: true
+    enabled: true
+    state: restarted
diff --git a/ansible/roles/ceph-node-exporter/meta/main.yml b/ansible/roles/ceph-node-exporter/meta/main.yml
new file mode 100644 (file)
index 0000000..e97ea33
--- /dev/null
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: ceph-defaults
diff --git a/ansible/roles/ceph-node-exporter/tasks/install_packages.yml b/ansible/roles/ceph-node-exporter/tasks/install_packages.yml
new file mode 100644 (file)
index 0000000..7c84834
--- /dev/null
@@ -0,0 +1,6 @@
+---
+- name: Install node_exporter
+  package:
+    name: "{{ node_exporter.packages }}"  # package list from this role's defaults (node_exporter.packages)
+    state: latest
+  notify: Restart service
diff --git a/ansible/roles/ceph-node-exporter/tasks/main.yml b/ansible/roles/ceph-node-exporter/tasks/main.yml
new file mode 100644 (file)
index 0000000..f64d1b9
--- /dev/null
@@ -0,0 +1,25 @@
+---
+- include: merge_vars.yml
+
+- include: setup_container.yml
+  when: containerized
+
+- include: install_packages.yml
+  when: not containerized
+
+- name: Check firewalld status
+  shell: "systemctl show firewalld | grep UnitFileState"
+  register: firewalld_status
+  failed_when: false
+  changed_when: false
+
+- name: Open ports for node_exporter
+  firewalld:
+    port: "{{ item }}"
+    zone: "{{ firewalld_zone }}"
+    state: enabled
+    immediate: true
+    permanent: true
+  with_items:
+    - 9100/tcp
+  when: "'enabled' in firewalld_status.stdout"
diff --git a/ansible/roles/ceph-node-exporter/tasks/merge_vars.yml b/ansible/roles/ceph-node-exporter/tasks/merge_vars.yml
new file mode 100644 (file)
index 0000000..f8dbcd0
--- /dev/null
@@ -0,0 +1,5 @@
+---
+- name: Combine default settings and user-defined variables
+  set_fact: {"{{ item }}": "{% if vars[item] is not defined %}{{ defaults[item] }}{% elif vars[item] is mapping %}{{ defaults[item]|combine(vars[item]|default({})) }}{% else %}{{ vars[item] }}{% endif %}"}
+  with_items: "{{ defaults.keys() }}"
+  no_log: true
diff --git a/ansible/roles/ceph-node-exporter/tasks/setup_container.yml b/ansible/roles/ceph-node-exporter/tasks/setup_container.yml
new file mode 100644 (file)
index 0000000..025624d
--- /dev/null
@@ -0,0 +1,32 @@
+---
+- name: Include ceph-docker
+  include_role:
+    name: ceph-docker
+    allow_duplicates: false
+
+- name: Start docker container
+  docker_container:
+    name: node-exporter
+    image: "{{ node_exporter.container_image }}"
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--no-collector.timex'
+    restart_policy: no
+    detach: true
+    volumes:
+      - '/proc:/host/proc:ro'
+      - '/sys:/host/sys:ro'
+    network_mode: host
+    keep_volumes: true
+    pull: true
+  notify: Restart service
+
+- name: Ship systemd service
+  copy:
+    src: node_exporter.service
+    dest: "/etc/systemd/system/"
+    owner: root
+    group: root
+    mode: 0644
+  notify: Restart service
diff --git a/ansible/roles/ceph-node-exporter/templates/sysconfig b/ansible/roles/ceph-node-exporter/templates/sysconfig
new file mode 100644 (file)
index 0000000..0b7a839
--- /dev/null
@@ -0,0 +1 @@
+{# read the merged node_exporter settings so a user-supplied sysconfig is honoured #}{{ node_exporter.sysconfig|default('') }}
diff --git a/ansible/roles/ceph-prometheus/defaults/main.yml b/ansible/roles/ceph-prometheus/defaults/main.yml
new file mode 100644 (file)
index 0000000..b45a5e4
--- /dev/null
@@ -0,0 +1,10 @@
+---
+defaults:
+  prometheus:
+    container_image: prom/prometheus:latest
+    container_cpu_period: 100000
+    container_cpu_cores: 2
+    # container_memory is in GB
+    container_memory: 4
+    data_dir: /var/lib/cephmetrics
+    user_id: '65534'  # This is the UID used by the prom/prometheus docker image
diff --git a/ansible/roles/ceph-prometheus/files/prometheus.service b/ansible/roles/ceph-prometheus/files/prometheus.service
new file mode 100644 (file)
index 0000000..7b6c8ef
--- /dev/null
@@ -0,0 +1,17 @@
+# Managed by ansible - local edits will be overwritten. Runs the existing
+# "prometheus" docker container attached, so systemd supervises its lifetime.
+[Unit]
+Description=prometheus
+After=docker.service
+
+[Service]
+EnvironmentFile=-/etc/environment
+ExecStart=/usr/bin/docker start --attach prometheus
+ExecStop=-/usr/bin/docker stop prometheus
+Restart=always
+RestartSec=10s
+TimeoutStartSec=120
+TimeoutStopSec=15
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/ceph-prometheus/handlers/main.yml b/ansible/roles/ceph-prometheus/handlers/main.yml
new file mode 100644 (file)
index 0000000..421bea5
--- /dev/null
@@ -0,0 +1,9 @@
+---
+- name: Service handler
+  # The .service file is shipped by this role, so the systemd module is used:
+  # its daemon_reload option makes systemd re-read unit files when this runs.
+  systemd:
+    state: restarted
+    enabled: true
+    daemon_reload: true
+    name: prometheus
diff --git a/ansible/roles/ceph-prometheus/meta/main.yml b/ansible/roles/ceph-prometheus/meta/main.yml
new file mode 100644 (file)
index 0000000..e97ea33
--- /dev/null
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - {role: ceph-defaults}  # shared defaults role; this role's merge_vars.yml is a symlink into it
diff --git a/ansible/roles/ceph-prometheus/tasks/install_packages.yml b/ansible/roles/ceph-prometheus/tasks/install_packages.yml
new file mode 100644 (file)
index 0000000..a4f9db2
--- /dev/null
@@ -0,0 +1,6 @@
+---
+- name: Install prometheus
+  notify: Service handler  # fires the handler whenever the package task reports a change
+  package:
+    state: latest  # installs or upgrades to the newest available package
+    name: prometheus
diff --git a/ansible/roles/ceph-prometheus/tasks/main.yml b/ansible/roles/ceph-prometheus/tasks/main.yml
new file mode 100644 (file)
index 0000000..813d398
--- /dev/null
@@ -0,0 +1,21 @@
+---
+- import_tasks: merge_vars.yml  # sets one fact per key of `defaults`, merged with user values
+
+- name: Create prometheus data directory
+  file:
+    path: "{{ prometheus.data_dir }}"
+    state: directory
+    owner: "{{ prometheus.user_id }}"  # same UID the container is started with
+
+- name: Write config file
+  template:
+    src: prometheus.yml
+    dest: "{{ prometheus.data_dir }}/"
+    owner: "{{ prometheus.user_id }}"
+  notify: Service handler
+
+- import_tasks: setup_container.yml  # containerized path; was a deprecated bare `include`
+  when: containerized
+
+- import_tasks: install_packages.yml  # package-based path
+  when: not containerized
diff --git a/ansible/roles/ceph-prometheus/tasks/merge_vars.yml b/ansible/roles/ceph-prometheus/tasks/merge_vars.yml
new file mode 120000 (symlink)
index 0000000..299adff
--- /dev/null
@@ -0,0 +1 @@
+../../ceph-defaults/tasks/merge_vars.yml
\ No newline at end of file
diff --git a/ansible/roles/ceph-prometheus/tasks/setup_container.yml b/ansible/roles/ceph-prometheus/tasks/setup_container.yml
new file mode 100644 (file)
index 0000000..3a5b195
--- /dev/null
@@ -0,0 +1,40 @@
+---
+- name: Include ceph-docker  # pull in the ceph-docker role once before using docker_container
+  include_role:
+    name: ceph-docker
+    allow_duplicates: false
+  when: containerized
+
+- name: Start docker container
+  docker_container:
+    name: prometheus
+    image: "{{ prometheus.container_image }}"
+    command: "--config.file=/prometheus/prometheus.yml"
+    restart_policy: 'no'  # quoted: a bare `no` is parsed by YAML as boolean false
+    published_ports: '9090:9090'
+    detach: true
+    volumes:  # data_dir also carries prometheus.yml, hence the --config.file path above
+      - "{{ prometheus.data_dir }}:/prometheus:Z"
+    networks:
+      - name: "{{ docker.network_name }}"
+    user: "{{ prometheus.user_id }}"
+    keep_volumes: true
+    pull: true
+    cpu_period: "{{ prometheus.container_cpu_period }}"
+    # As of ansible-2.5.2, this module doesn't support the equivalent of the
+    # --cpus flag, so we must use period/quota for now
+    cpu_quota: "{{ prometheus.container_cpu_period * prometheus.container_cpu_cores }}"
+    # container_memory is a number of GB; memory_swap is set to twice the
+    # memory value.
+    memory: "{{ prometheus.container_memory }}GB"
+    memory_swap: "{{ prometheus.container_memory * 2 }}GB"
+  notify: Service handler
+
+- name: Ship systemd service
+  copy:
+    src: prometheus.service
+    dest: "/etc/systemd/system/"
+    owner: root
+    group: root
+    mode: '0644'  # quoted so the octal mode is never reinterpreted as an integer
+  notify: Service handler
diff --git a/ansible/roles/ceph-prometheus/templates/prometheus.yml b/ansible/roles/ceph-prometheus/templates/prometheus.yml
new file mode 100644 (file)
index 0000000..038c789
--- /dev/null
@@ -0,0 +1,39 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+  - job_name: 'ceph'
+    honor_labels: true
+    static_configs:
+{# One target per host in the 'mgrs' inventory group; all of them get the same instance label. #}{% for host in groups['mgrs'] %}
+      - targets: ['{{ host }}:9283']
+        labels:
+          instance: 'ceph_cluster'
+{% endfor %}
+  - job_name: 'node'
+    static_configs:
+{# Every inventory host except those in 'ceph-grafana', labelled with its ansible_nodename fact. #}{% for host in (groups['all'] | difference(groups['ceph-grafana'])) %}
+      - targets: ['{{ host }}:9100']
+        labels:
+          instance: "{{ hostvars[host]['ansible_nodename'] }}"
+{% endfor %}
+  - job_name: 'grafana'
+    static_configs:
+{# NOTE(review): the 'ceph-grafana' hosts are scraped on 9100, the same port as the 'node' job - confirm this is intended. #}{% for host in groups['ceph-grafana'] %}
+      - targets: ['{{ host }}:9100']
+        labels:
+          instance: "{{ hostvars[host]['ansible_nodename'] }}"
+{% endfor %}
+{# The iSCSI gateway job is only emitted when the inventory defines an 'iscsis' group. #}{% if 'iscsis' in groups %}
+  - job_name: 'iscsi-gws'
+    static_configs:
+{% for host in groups['iscsis'] %}
+      - targets: ['{{ host }}:9287']
+        labels:
+          instance: "{{ hostvars[host]['ansible_nodename'] }}"
+{% endfor %}
+{% endif %}
diff --git a/build_srpm b/build_srpm
new file mode 100755 (executable)
index 0000000..5c1665a
--- /dev/null
@@ -0,0 +1,52 @@
+#! /usr/bin/bash
+set -ex
+
+## Install any setup-time deps (to make dist package)
+
+# We need this to get the major version from lsb_release
+#sudo yum install -y redhat-lsb-core mock git wget
+
+# Run the install-deps.sh upstream script if it exists
+if [ -x install-deps.sh ]; then
+    echo "Ensuring dependencies are installed"
+    sudo ./install-deps.sh
+fi
+
+# Clean dist dir
+rm -rf dist/ && mkdir dist
+rm -f *.any.src.rpm
+
+## Get some basic information about the system and the repository
+DESCRIBE="$(git describe --tags 2>/dev/null | cut -b 2-)"
+test -z "$DESCRIBE" && DESCRIBE="0.1-$(git rev-list --count HEAD)-g$(git rev-parse --short HEAD)"
+VERSION=1.0
+REVISION="$(echo $DESCRIBE | cut -s -d - -f 2-)"
+test -z "$REVISION" && REVISION=0
+RPM_RELEASE=1
+
+
+## Build the source tarball
+echo "Building source distribution"
+git archive --format=zip --prefix=dashboard-ansible-${VERSION}/ HEAD > dist/dashboard-ansible-${VERSION}.zip
+#wget https://grafana.com/api/plugins/vonage-status-panel/versions/1.0.4/download -O dist/vonage-status-panel-1.0.4.zip
+#wget https://grafana.com/api/plugins/grafana-piechart-panel/versions/1.1.5/download -O dist/grafana-piechart-panel-1.1.5.zip
+
+
+## Prepare the spec file for build
+sed -e "s/@VERSION@/${VERSION}/g" -e "s/@RELEASE@/${RPM_RELEASE}/g" < dashboard-ansible.spec.in > dist/dashboard-ansible.spec
+
+
+## Create the source rpm
+echo "Building SRPM"
+rpmbuild \
+    --define "_sourcedir ./dist" \
+    --define "_specdir ." \
+    --define "_builddir ." \
+    --define "_srcrpmdir ." \
+    --define "_rpmdir ." \
+    --define "dist .any" \
+    --define "fedora 21" \
+    --define "rhel 7" \
+    --nodeps -bs dist/dashboard-ansible.spec
+SRPM=$(readlink -f *.src.rpm)
+echo "SRPM='$SRPM'"
diff --git a/dashboard-ansible.spec.in b/dashboard-ansible.spec.in
new file mode 100644 (file)
index 0000000..e38eeaa
--- /dev/null
@@ -0,0 +1,65 @@
+%define debug_package %{nil}
+
+%{!?_selinux_policy_version: %global _selinux_policy_version %(sed -e 's,.*selinux-policy-\\([^/]*\\)/.*,\\1,' /usr/share/selinux/devel/policyhelp 2>/dev/null)}
+
+Name:          dashboard-ansible
+Version:       @VERSION@
+Release:       @RELEASE@%{?dist}
+Summary:       Monitoring service for Ceph clusters deployment tool
+
+License:       GPLv3
+URL:           https://github.com/ceph/cephmetrics
+Source0:       dashboard-ansible-@VERSION@.zip
+BuildArch:     noarch
+
+Requires:      ceph-ansible
+Requires:      PyYAML
+Requires:      python-requests
+Requires:      python-netaddr
+Obsoletes:     cephmetrics-grafana-plugins < %{version}-%{release}
+Obsoletes:     cephmetrics-collectors < %{version}-%{release}
+Obsoletes:     cephmetrics < %{version}-%{release}
+Obsoletes:     cephmetrics-ansible < %{version}-%{release}
+
+%description
+The monitoring service with web frontend for Ceph storage clusters providing several statistical data graphed by grafana. This package contains a set of ansible playbooks to deploy a cephmetrics server.
+
+%prep
+%setup -q
+# Disable devel_mode in the rpms (the patch is read from the unpacked source tree)
+patch -p1 < patches/0001-ansible-Disable-devel_mode.patch
+
+
+%build
+# Force devel_mode to false in every role's defaults (appears to overlap with the patch above)
+sed -i -e 's/devel_mode: true/devel_mode: false/' ansible/roles/*/defaults/main.yml
+
+# Point the prometheus role at the Red Hat registry image instead of its default
+sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus:v3.10|' ansible/roles/ceph-prometheus/defaults/main.yml
+#sed -i -e 's|version: .*$|version: v3.9|' ansible/roles/ceph-prometheus/defaults/main.yml
+
+# Change the grafana container location/version. NOTE(review): this edits the ceph-dashboard role defaults, not ceph-grafana - confirm
+sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/rhceph/rhceph-3-dashboard-rhel7:3|' ansible/roles/ceph-dashboard/defaults/main.yml
+#sed -i -e 's|version: .*$|version: 3|' ansible/roles/ceph-grafana/defaults/main.yml
+
+# Change the node_exporter container location/version
+sed -i -e 's|container_image: .*$|container_image: registry.access.redhat.com/openshift3/prometheus-node-exporter:v3.10|' ansible/roles/ceph-node-exporter/defaults/main.yml
+
+# Change the service_name for node_exporter (currently disabled)
+#sed -i -e 's|service_name: .*|service_name: prometheus-node-exporter|' ansible/roles/ceph-node-exporter/defaults/main.yml
+
+
+%install
+# Install ansible playbooks; cp -L copies the targets of symlinks rather than the links
+install -d %{buildroot}%{_datadir}
+cp -L -r ansible %{buildroot}%{_datadir}/dashboard-ansible
+#rm -f %{buildroot}%{_datadir}/dashboard-ansible/roles/ceph-grafana/files/dashboards
+exit 0  # NOTE(review): ends the install script early with success; presumably deliberate - confirm
+
+
+%files
+%{_datadir}/dashboard-ansible
+%doc LICENSE
+%doc README
+
+%changelog
diff --git a/dashboards/README b/dashboards/README
new file mode 100644 (file)
index 0000000..3803cd7
--- /dev/null
@@ -0,0 +1,28 @@
+Context
+These dashboards should be enough to get started on the integration. It's not a complete set, so more will be added in the next week.
+
+Bear in mind that the osd device details dashboard needs node_exporter active - all the other dashboards pick data out of ceph-mgr based metrics.
+
+
+The cephfs dashboard only has 2 panels currently. The counters available are
+a little light at the moment. Patrick/Venky have been addressing this with
+https://bugzilla.redhat.com/show_bug.cgi?id=1618523
+cephfs-overview.json
+
+Host Information
+host-details.json combines generic server metrics that show cpu/memory/network stats (including network errors/drops),
+with disk level stats for OSD hosts. OSD charts show the physical device name together with its corresponding osd id for correlation.
+
+Ceph Pools
+Two dashboards. Overview gives the high-level combined view; pool-detail needs a pool_name variable passed to it (currently uses a templating var which is visible)
+pool-overview.json
+pool-detail.json
+
+OSD Device Details. This dashboard needs some further work. It currently shows
+OSD level stats with physical device stats but leaves out some of the counters
+that cephmetrics provides for troubleshooting.
+osd-device-details.json
+
+Object gateway dashboards, again split into overview and detail. The detail dashboard needs the relevant ceph-daemon name for the rgw instance.
+radosgw-overview.json
+radosgw-detail.json
diff --git a/dashboards/ceph-cluster.json b/dashboards/ceph-cluster.json
new file mode 100644 (file)
index 0000000..0408886
--- /dev/null
@@ -0,0 +1,1250 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "heatmap",
+      "name": "Heatmap",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "singlestat",
+      "name": "Singlestat",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "vonage-status-panel",
+      "name": "Status Panel",
+      "version": "1.0.8"
+    }
+  ],
+  "annotations": {
+    "list": []
+  },
+  "description": "Ceph cluster overview",
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1525415495309,
+  "links": [],
+  "panels": [
+    {
+      "cacheTimeout": null,
+      "colorBackground": true,
+      "colorValue": false,
+      "colors": [
+        "rgba(50, 128, 45, 0.9)",
+        "rgba(237, 129, 40, 0.9)",
+        "rgb(255, 0, 0)"
+      ],
+      "datasource": "$datasource",
+      "editable": true,
+      "error": false,
+      "format": "none",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 2,
+        "x": 0,
+        "y": 0
+      },
+      "hideTimeOverride": true,
+      "id": 21,
+      "interval": "1m",
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "span": 2,
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "ceph_health_status{instance=~'$instance'}",
+          "format": "time_series",
+          "interval": "$interval",
+          "intervalFactor": 1,
+          "refId": "A",
+          "step": 60
+        }
+      ],
+      "thresholds": "1,2",
+      "timeFrom": "1m",
+      "title": "Health Status",
+      "transparent": false,
+      "type": "singlestat",
+      "valueFontSize": "50%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "OK",
+          "value": "0"
+        },
+        {
+          "op": "=",
+          "text": "WARN",
+          "value": "1"
+        },
+        {
+          "op": "=",
+          "text": "ERR",
+          "value": "2"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "colorMode": "Panel",
+      "colors": {
+        "crit": "rgb(255, 0, 0)",
+        "disable": "rgba(128, 128, 128, 0.9)",
+        "ok": "rgba(50, 128, 45, 0.9)",
+        "warn": "rgba(237, 129, 40, 0.9)"
+      },
+      "cornerRadius": 0,
+      "datasource": "$datasource",
+      "displayName": "",
+      "flipCard": false,
+      "flipTime": 5,
+      "fontFormat": "Regular",
+      "gridPos": {
+        "h": 3,
+        "w": 2,
+        "x": 2,
+        "y": 0
+      },
+      "id": 43,
+      "isAutoScrollOnOverflow": false,
+      "isGrayOnNoData": false,
+      "isHideAlertsOnDisable": false,
+      "isIgnoreOKColors": false,
+      "links": [],
+      "targets": [
+        {
+          "aggregation": "Last",
+          "alias": "All",
+          "decimals": 2,
+          "displayAliasType": "Always",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "count(ceph_osd_metadata{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "All",
+          "refId": "A",
+          "units": "none",
+          "valueHandler": "Number Threshold"
+        },
+        {
+          "aggregation": "Last",
+          "alias": "In",
+          "decimals": 2,
+          "displayAliasType": "Always",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "sum(ceph_osds_in{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "In",
+          "refId": "B",
+          "units": "none",
+          "valueHandler": "Number Threshold"
+        },
+        {
+          "aggregation": "Last",
+          "alias": "Out",
+          "decimals": 2,
+          "displayAliasType": "Warning / Critical",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "sum(ceph_osd_in{instance=~\"$instance\"} == bool 0)",
+          "format": "time_series",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "Out",
+          "refId": "C",
+          "units": "none",
+          "valueHandler": "Number Threshold",
+          "warn": 1
+        },
+        {
+          "aggregation": "Last",
+          "alias": "Up",
+          "decimals": 2,
+          "displayAliasType": "Always",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "sum(ceph_osd_up{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Up",
+          "refId": "D",
+          "units": "none",
+          "valueHandler": "Number Threshold"
+        },
+        {
+          "aggregation": "Last",
+          "alias": "Down",
+          "crit": 2,
+          "decimals": 2,
+          "displayAliasType": "Warning / Critical",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "sum(ceph_osd_up{instance=~\"$instance\"} == bool 0)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Down",
+          "refId": "E",
+          "units": "none",
+          "valueHandler": "Number Threshold",
+          "warn": 1
+        }
+      ],
+      "title": "OSDs",
+      "type": "vonage-status-panel"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "format": "percentunit",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": true,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 4,
+        "x": 4,
+        "y": 0
+      },
+      "id": 47,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": true
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "sum(ceph_osd_stat_bytes_used{instance=~\"$instance\"})/sum(ceph_osd_stat_bytes{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Used",
+          "refId": "A"
+        }
+      ],
+      "thresholds": "70,80",
+      "title": "Capacity used",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 0,
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 8,
+        "y": 0
+      },
+      "id": 53,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [
+        {
+          "alias": "Active",
+          "color": "#508642",
+          "fill": 1,
+          "stack": "A"
+        },
+        {
+          "alias": "Total",
+          "color": "#f9e2d2"
+        },
+        {
+          "alias": "Degraded",
+          "color": "#eab839"
+        },
+        {
+          "alias": "Undersized",
+          "color": "#f9934e"
+        },
+        {
+          "alias": "Inconsistent",
+          "color": "#e24d42"
+        },
+        {
+          "alias": "Down",
+          "color": "#bf1b00"
+        },
+        {
+          "alias": "Inactive",
+          "color": "#bf1b00",
+          "fill": 4,
+          "linewidth": 0,
+          "stack": "A"
+        }
+      ],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "ceph_pg_total",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Total",
+          "refId": "A"
+        },
+        {
+          "expr": "ceph_pg_active",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Active",
+          "refId": "B"
+        },
+        {
+          "expr": "ceph_pg_total - ceph_pg_active",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Inactive",
+          "refId": "G"
+        },
+        {
+          "expr": "ceph_pg_undersized",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Undersized",
+          "refId": "F"
+        },
+        {
+          "expr": "ceph_pg_degraded",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Degraded",
+          "refId": "C"
+        },
+        {
+          "expr": "ceph_pg_inconsistent",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Inconsistent",
+          "refId": "D"
+        },
+        {
+          "expr": "ceph_pg_down",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Down",
+          "refId": "E"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "PG States",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 0,
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 16,
+        "y": 0
+      },
+      "id": 66,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [
+        {
+          "alias": "Avg Apply Latency",
+          "color": "#7eb26d"
+        }
+      ],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "quantile(0.95, ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Apply Latency P_95",
+          "refId": "A"
+        },
+        {
+          "expr": "quantile(0.95, ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Commit Latency P_95",
+          "refId": "B"
+        },
+        {
+          "expr": "avg(ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Avg Apply Latency",
+          "refId": "C"
+        },
+        {
+          "expr": "avg(ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Avg Commit Latency",
+          "refId": "D"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "OSD Latencies",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "clusterName": "",
+      "colorMode": "Panel",
+      "colors": {
+        "crit": "rgba(245, 54, 54, 0.9)",
+        "disable": "rgba(128, 128, 128, 0.9)",
+        "ok": "rgba(50, 128, 45, 0.9)",
+        "warn": "rgba(237, 129, 40, 0.9)"
+      },
+      "cornerRadius": 1,
+      "datasource": "$datasource",
+      "displayName": "",
+      "flipCard": false,
+      "flipTime": 5,
+      "fontFormat": "Regular",
+      "gridPos": {
+        "h": 3,
+        "w": 2,
+        "x": 0,
+        "y": 3
+      },
+      "id": 41,
+      "isAutoScrollOnOverflow": false,
+      "isGrayOnNoData": false,
+      "isHideAlertsOnDisable": false,
+      "isIgnoreOKColors": false,
+      "links": [],
+      "targets": [
+        {
+          "aggregation": "Last",
+          "alias": "In Quorum",
+          "decimals": 2,
+          "displayAliasType": "Always",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "sum(ceph_mon_quorum_status{instance=~\"$instance\"})",
+          "format": "time_series",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "In Quorum",
+          "refId": "A",
+          "units": "none",
+          "valueHandler": "Text Only"
+        },
+        {
+          "aggregation": "Last",
+          "alias": "Total",
+          "crit": 1,
+          "decimals": 2,
+          "displayAliasType": "Always",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Total",
+          "refId": "B",
+          "units": "none",
+          "valueHandler": "Text Only",
+          "warn": 2
+        },
+        {
+          "aggregation": "Last",
+          "alias": "MONs out of Quorum",
+          "crit": 1.6,
+          "decimals": 2,
+          "displayAliasType": "Warning / Critical",
+          "displayType": "Annotation",
+          "displayValueWithAlias": "Never",
+          "expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"}) / sum(ceph_mon_quorum_status{instance=~\"$instance\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "MONs out of Quorum",
+          "refId": "C",
+          "units": "none",
+          "valueHandler": "Number Threshold",
+          "warn": 1.1
+        }
+      ],
+      "title": "Monitors",
+      "type": "vonage-status-panel"
+    },
+    {
+      "colorMode": "Disabled",
+      "colors": {
+        "crit": "rgba(245, 54, 54, 0.9)",
+        "disable": "rgba(128, 128, 128, 0.9)",
+        "ok": "rgba(50, 128, 45, 0.9)",
+        "warn": "rgba(237, 129, 40, 0.9)"
+      },
+      "cornerRadius": 0,
+      "datasource": "$datasource",
+      "displayName": "",
+      "flipCard": false,
+      "flipTime": 5,
+      "fontFormat": "Regular",
+      "gridPos": {
+        "h": 3,
+        "w": 2,
+        "x": 2,
+        "y": 3
+      },
+      "id": 68,
+      "isAutoScrollOnOverflow": false,
+      "isGrayOnNoData": false,
+      "isHideAlertsOnDisable": false,
+      "isIgnoreOKColors": false,
+      "links": [],
+      "targets": [
+        {
+          "aggregation": "Last",
+          "alias": "Clients",
+          "decimals": 2,
+          "displayAliasType": "Always",
+          "displayType": "Regular",
+          "displayValueWithAlias": "When Alias Displayed",
+          "expr": "ceph_mds_server_handle_client_session{instance=~\"$instance\"}",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Clients",
+          "refId": "A",
+          "units": "none",
+          "valueHandler": "Number Threshold"
+        }
+      ],
+      "title": "Client connections",
+      "type": "vonage-status-panel"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 6
+      },
+      "id": 45,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 0.5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [
+        {
+          "alias": "Reads",
+          "transform": "negative-Y"
+        }
+      ],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(irate(ceph_osd_op_w_in_bytes{instance=~\"$instance\"}[1m]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Writes",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(irate(ceph_osd_op_r_out_bytes{instance=~\"$instance\"}[1m]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Reads",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Cluster I/O",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "Bps",
+          "label": "Read (-) / Write (+)",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 6
+      },
+      "id": 62,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(deriv(ceph_pool_bytes_used{instance=~\"$instance\"}[1m]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "In-/Egress",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "Bps",
+          "label": "Egress (-) / Ingress (+)",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "cards": {
+        "cardPadding": null,
+        "cardRound": 1
+      },
+      "color": {
+        "cardColor": "rgb(0, 254, 255)",
+        "colorScale": "sqrt",
+        "colorScheme": "interpolateBlues",
+        "exponent": 0.5,
+        "min": null,
+        "mode": "spectrum"
+      },
+      "dataFormat": "timeseries",
+      "datasource": "$datasource",
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 0,
+        "y": 15
+      },
+      "heatmap": {},
+      "highlightCards": true,
+      "id": 55,
+      "legend": {
+        "show": true
+      },
+      "links": [],
+      "span": 12,
+      "targets": [
+        {
+          "expr": "ceph_osd_stat_bytes_used{instance=~\"$instance\"} / ceph_osd_stat_bytes{instance=~\"$instance\"}",
+          "format": "time_series",
+          "interval": "1m",
+          "intervalFactor": 1,
+          "legendFormat": "Util (%)",
+          "refId": "A",
+          "step": 60
+        }
+      ],
+      "timeFrom": null,
+      "title": "OSD Capacity Utilization",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "xBucketNumber": null,
+      "xBucketSize": "",
+      "yAxis": {
+        "decimals": null,
+        "format": "percentunit",
+        "logBase": 1,
+        "max": null,
+        "min": null,
+        "show": true,
+        "splitFactor": null
+      },
+      "yBucketNumber": null,
+      "yBucketSize": null
+    },
+    {
+      "cards": {
+        "cardPadding": null,
+        "cardRound": 1
+      },
+      "color": {
+        "cardColor": "#b4ff00",
+        "colorScale": "sqrt",
+        "colorScheme": "interpolateBlues",
+        "exponent": 0.5,
+        "mode": "spectrum"
+      },
+      "dataFormat": "timeseries",
+      "datasource": "$datasource",
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 6,
+        "y": 15
+      },
+      "heatmap": {},
+      "highlightCards": true,
+      "id": 59,
+      "legend": {
+        "show": true
+      },
+      "links": [],
+      "targets": [
+        {
+          "expr": "ceph_osd_numpg{instance=~\"$instance\"}",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "#PGs",
+          "refId": "A"
+        }
+      ],
+      "title": "PGs per OSD",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "xBucketNumber": null,
+      "xBucketSize": "",
+      "yAxis": {
+        "decimals": null,
+        "format": "none",
+        "logBase": 1,
+        "max": null,
+        "min": null,
+        "show": true,
+        "splitFactor": null
+      },
+      "yBucketNumber": null,
+      "yBucketSize": null
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 15
+      },
+      "id": 64,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(irate(ceph_osd_recovery_ops[1m]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Op/s",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Recovery Rate",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ops",
+          "label": "Recovery Ops/s",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [
+    "ceph",
+    "cluster"
+  ],
+  "templating": {
+    "list": [
+      {
+        "hide": 0,
+        "label": null,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "auto": true,
+        "auto_count": 10,
+        "auto_min": "1m",
+        "current": {
+          "text": "auto",
+          "value": "$__auto_interval_interval"
+        },
+        "datasource": null,
+        "hide": 0,
+        "includeAll": false,
+        "label": "Interval",
+        "multi": false,
+        "name": "interval",
+        "options": [
+          {
+            "selected": true,
+            "text": "auto",
+            "value": "$__auto_interval_interval"
+          },
+          {
+            "selected": false,
+            "text": "1m",
+            "value": "1m"
+          },
+          {
+            "selected": false,
+            "text": "10m",
+            "value": "10m"
+          },
+          {
+            "selected": false,
+            "text": "30m",
+            "value": "30m"
+          },
+          {
+            "selected": false,
+            "text": "1h",
+            "value": "1h"
+          },
+          {
+            "selected": false,
+            "text": "6h",
+            "value": "6h"
+          },
+          {
+            "selected": false,
+            "text": "12h",
+            "value": "12h"
+          },
+          {
+            "selected": false,
+            "text": "1d",
+            "value": "1d"
+          },
+          {
+            "selected": false,
+            "text": "7d",
+            "value": "7d"
+          },
+          {
+            "selected": false,
+            "text": "14d",
+            "value": "14d"
+          },
+          {
+            "selected": false,
+            "text": "30d",
+            "value": "30d"
+          }
+        ],
+        "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+        "refresh": 2,
+        "type": "interval"
+      },
+      {
+        "allFormat": "glob",
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 0,
+        "hideLabel": false,
+        "includeAll": true,
+        "label": "Exporter Instance",
+        "multi": false,
+        "multiFormat": "glob",
+        "name": "instance",
+        "options": [],
+        "query": "label_values(ceph_health_status, instance)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "Ceph - Cluster",
+  "version": 13
+    }
diff --git a/dashboards/cephfs-overview.json b/dashboards/cephfs-overview.json
new file mode 100644 (file)
index 0000000..7061392
--- /dev/null
@@ -0,0 +1,294 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1534386614546,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 10,
+      "panels": [],
+      "title": "MDS Performance",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 2,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(ceph_objecter_op_r{ceph_daemon=~\"($mds_servers).*\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "MDS Reads",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(ceph_objecter_op_w{ceph_daemon=~\"($mds_servers).*\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "MDS Writes",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "MDS Workload - $mds_servers",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "none",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 1
+      },
+      "id": 4,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "ceph_mds_server_handle_client_request{ceph_daemon=~\"($mds_servers).*\"}",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{ceph_daemon}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Client Request Load - $mds_servers",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "none",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    }
+  ],
+  "refresh": "15s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 0,
+        "includeAll": true,
+        "label": "MDS Server",
+        "multi": false,
+        "name": "mds_servers",
+        "options": [],
+        "query": "label_values(ceph_mds_inodes, ceph_daemon)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "15s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "MDS Performance",
+  "uid": "rRfFzWtik",
+  "version": 2
+}
diff --git a/dashboards/host-details.json b/dashboards/host-details.json
new file mode 100644 (file)
index 0000000..90a048f
--- /dev/null
@@ -0,0 +1,1134 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "singlestat",
+      "name": "Singlestat",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1538021829737,
+  "links": [
+    {
+      "asDropdown": true,
+      "icon": "external link",
+      "tags": [
+        "overview"
+      ],
+      "title": "Shortcuts",
+      "type": "dashboards"
+    }
+  ],
+  "panels": [
+    {
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 16,
+      "title": "$ceph_hosts System Overview",
+      "type": "row"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "rgba(245, 54, 54, 0.9)",
+        "rgba(237, 129, 40, 0.89)",
+        "rgba(50, 172, 45, 0.97)"
+      ],
+      "datasource": "$datasource",
+      "format": "none",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 3,
+        "x": 0,
+        "y": 1
+      },
+      "height": "160",
+      "id": 1,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": "",
+      "minSpan": 4,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "count(ceph_disk_occupation{instance=~\"($ceph_hosts).*\"})",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "refId": "A",
+          "step": 40,
+          "textEditor": true
+        }
+      ],
+      "thresholds": "",
+      "title": "OSDs",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {
+        "interrupt": "#447EBC",
+        "steal": "#6D1F62",
+        "system": "#890F02",
+        "user": "#3F6833",
+        "wait": "#C15C17"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Shows the CPU breakdown. When multiple servers are selected, only the first host's CPU data is shown.",
+      "fill": 3,
+      "gridPos": {
+        "h": 10,
+        "w": 6,
+        "x": 3,
+        "y": 1
+      },
+      "id": 9,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "minSpan": 12,
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "repeat": null,
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum by (mode) (\n  irate(node_cpu{instance=~\"($ceph_hosts).*\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m]) or\n  irate(node_cpu_seconds_total{instance=~\"($ceph_hosts).*\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m])\n) / scalar(\n  sum(irate(node_cpu{instance=~\"($ceph_hosts).*\"}[1m]) or\n      irate(node_cpu_seconds_total{instance=~\"($ceph_hosts).*\"}[1m]))\n) * 100",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "{{mode}}",
+          "refId": "A",
+          "step": 10,
+          "textEditor": true
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "CPU Utilisation",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": "100",
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {
+        "Available": "#508642",
+        "Free": "#508642",
+        "Total": "#bf1b00",
+        "Used": "#bf1b00",
+        "total": "#bf1b00",
+        "used": "#0a50a1"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 10,
+        "w": 6,
+        "x": 9,
+        "y": 1
+      },
+      "id": 14,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [
+        {
+          "alias": "total",
+          "color": "#bf1b00",
+          "fill": 0,
+          "linewidth": 2,
+          "stack": false
+        }
+      ],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(node_memory_MemTotal{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]].*\"})- (\n  (node_memory_MemFree{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]].*\"})  + \n  (node_memory_Cached{instance=~\"[[ceph_hosts]].*\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n  (node_memory_Buffers{instance=~\"[[ceph_hosts]].*\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]].*\"}) +\n  (node_memory_Slab{instance=~\"[[ceph_hosts]].*\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]].*\"})\n  )\n  \n",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "used",
+          "refId": "D"
+        },
+        {
+          "expr": "node_memory_MemFree{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]].*\"} ",
+          "format": "time_series",
+          "hide": false,
+          "intervalFactor": 1,
+          "legendFormat": "Free",
+          "refId": "A"
+        },
+        {
+          "expr": "(node_memory_Cached{instance=~\"[[ceph_hosts]].*\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n(node_memory_Buffers{instance=~\"[[ceph_hosts]].*\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]].*\"}) +\n(node_memory_Slab{instance=~\"[[ceph_hosts]].*\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]].*\"}) \n",
+          "format": "time_series",
+          "hide": false,
+          "intervalFactor": 1,
+          "legendFormat": "buffers/cache",
+          "refId": "C"
+        },
+        {
+          "expr": "node_memory_MemTotal{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]].*\"} ",
+          "format": "time_series",
+          "hide": false,
+          "intervalFactor": 1,
+          "legendFormat": "total",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "RAM Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Shows the network load (rx, tx) across all interfaces (excluding loopback 'lo').",
+      "fill": 1,
+      "gridPos": {
+        "h": 10,
+        "w": 6,
+        "x": 15,
+        "y": 1
+      },
+      "id": 10,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "hideZero": true,
+        "max": false,
+        "min": false,
+        "rightSide": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "minSpan": 12,
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum by (device) (\n  irate(node_network_receive_bytes{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m]) or \n  irate(node_network_receive_bytes_total{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m])\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{device}}.rx",
+          "refId": "A",
+          "step": 10,
+          "textEditor": true
+        },
+        {
+          "expr": "sum by (device) (\n  irate(node_network_transmit_bytes{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m]) or\n  irate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m])\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{device}}.tx",
+          "refId": "B",
+          "step": 10
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Network Load",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "decbytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 5,
+        "w": 3,
+        "x": 21,
+        "y": 1
+      },
+      "hideTimeOverride": true,
+      "id": 18,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(irate(node_network_transmit_drop{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_transmit_drop_total{instance=~\"[[ceph_hosts]].*\"}[1m])) + \n(irate(node_network_receive_drop{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_receive_drop_total{instance=~\"[[ceph_hosts]].*\"}[1m]))",
+          "format": "time_series",
+          "instant": false,
+          "intervalFactor": 1,
+          "legendFormat": "{{device}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": "15m",
+      "timeShift": null,
+      "title": "Network Drops (last 15mins)",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "rgba(245, 54, 54, 0.9)",
+        "rgba(237, 129, 40, 0.89)",
+        "rgba(50, 172, 45, 0.97)"
+      ],
+      "datasource": "$datasource",
+      "decimals": 0,
+      "description": "Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.",
+      "format": "bytes",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 3,
+        "x": 0,
+        "y": 6
+      },
+      "height": "160",
+      "id": 2,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": "",
+      "minSpan": 4,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts).*\"})",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "refId": "A",
+          "step": 40,
+          "textEditor": true
+        }
+      ],
+      "thresholds": "",
+      "title": "Raw Capacity",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 5,
+        "w": 3,
+        "x": 21,
+        "y": 6
+      },
+      "hideTimeOverride": true,
+      "id": 19,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "irate(node_network_transmit_errs{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_transmit_errs_total{instance=~\"[[ceph_hosts]].*\"}[1m])+ \nirate(node_network_receive_errs{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_receive_errs_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
+          "format": "time_series",
+          "instant": false,
+          "intervalFactor": 1,
+          "legendFormat": "{{device}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": "15m",
+      "timeShift": null,
+      "title": "Network Errors (last 15mins)",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "collapsed": true,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 11
+      },
+      "id": 12,
+      "panels": [
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$datasource",
+          "description": "For any OSD devices on the host, this chart shows the IOPS per physical device. Each device is shown by its name and corresponding OSD id value",
+          "fill": 1,
+          "gridPos": {
+            "h": 9,
+            "w": 11,
+            "x": 0,
+            "y": 12
+          },
+          "id": 6,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "minSpan": 12,
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "((irate(node_disk_reads_completed{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts).*\"}[5m]) )  + \n(irate(node_disk_writes_completed{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts).*\"}[5m]))) *\non(instance, device) group_left(osd_id) label_replace(ceph_disk_occupation, \"osd_id\",\"$1\",\"ceph_daemon\",\"osd.(.*)\") ",
+              "format": "time_series",
+              "intervalFactor": 1,
+              "legendFormat": "{{device}}({{osd_id}})",
+              "refId": "A",
+              "step": 10,
+              "textEditor": true
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "$ceph_hosts Disk IOPS",
+          "tooltip": {
+            "shared": true,
+            "sort": 2,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "IOPS",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$datasource",
+          "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.",
+          "fill": 1,
+          "gridPos": {
+            "h": 9,
+            "w": 11,
+            "x": 12,
+            "y": 12
+          },
+          "id": 5,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "minSpan": 12,
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "(\n  (irate(node_disk_io_time_ms[5m]) / 10 ) or\n  irate(node_disk_io_time_seconds_total[5m]) * 100)\n* on(instance, device) group_left(osd_id) label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts).*\"}, \"osd_id\",\"$1\",\"ceph_daemon\",\"osd.(.*)\")\n",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 1,
+              "legendFormat": "{{device}}({{osd_id}})",
+              "refId": "C",
+              "step": 10,
+              "textEditor": true
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "$ceph_hosts Disk utilisation",
+          "tooltip": {
+            "shared": true,
+            "sort": 2,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "short",
+              "label": "%Util",
+              "logBase": 1,
+              "max": "100",
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$datasource",
+          "description": "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with its corresponding OSD id",
+          "fill": 1,
+          "gridPos": {
+            "h": 9,
+            "w": 11,
+            "x": 0,
+            "y": 21
+          },
+          "id": 7,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "minSpan": 12,
+          "nullPointMode": "null as zero",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "max by(instance,device) (((\n  (irate(node_disk_write_time_ms{ instance=~\"($ceph_hosts).*\"}[5m]) )\n  / clamp_min(irate(node_disk_writes_completed{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001) or \n  (irate(node_disk_read_time_ms{ instance=~\"($ceph_hosts).*\"}[5m]) )\n  / clamp_min(irate(node_disk_reads_completed{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001)\n  ) / 1000) or\n  (irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts).*\"}[5m]) )\n  / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001) or \n  (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts).*\"}[5m]) )\n  / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001)\n  ) *\n  on(instance,device) group_left(osd_id) label_replace(ceph_disk_occupation,\"osd_id\",\"$1\",\"ceph_daemon\",\"osd.(.*)\")",
+              "format": "time_series",
+              "hide": false,
+              "intervalFactor": 1,
+              "legendFormat": "{{device}}({{osd_id}})",
+              "refId": "D"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "$ceph_hosts Disk Latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 2,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "s",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ]
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "$datasource",
+          "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id",
+          "fill": 1,
+          "gridPos": {
+            "h": 9,
+            "w": 11,
+            "x": 12,
+            "y": 21
+          },
+          "id": 8,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "links": [],
+          "minSpan": 12,
+          "nullPointMode": "connected",
+          "percentage": false,
+          "pointradius": 5,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": " ((irate(node_disk_bytes_read{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts).*\"}[5m]))  + \n  (irate(node_disk_bytes_written{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts).*\"}[5m]))\n ) * \n  on(instance,device) group_left(osd_id) label_replace(ceph_disk_occupation,\"osd_id\",\"$1\",\"ceph_daemon\",\"osd.(.*)\")",
+              "format": "time_series",
+              "interval": "",
+              "intervalFactor": 1,
+              "legendFormat": "{{device}}({{osd_id}})",
+              "refId": "A",
+              "step": 10,
+              "textEditor": true
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "$ceph_hosts Throughput by Disk",
+          "tooltip": {
+            "shared": true,
+            "sort": 2,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "decbytes",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ]
+        }
+      ],
+      "repeat": null,
+      "title": "OSD Disk Performance Statistics",
+      "type": "row"
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [
+    "overview"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 0,
+        "includeAll": false,
+        "label": "Hostname",
+        "multi": false,
+        "name": "ceph_hosts",
+        "options": [],
+        "query": "label_values(node_scrape_collector_success, instance) ",
+        "refresh": 1,
+        "regex": "([^.]*).*",
+        "sort": 3,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "Host Details",
+  "uid": "7IGu2Ttmz",
+  "version": 11
+}
diff --git a/dashboards/hosts-overview.json b/dashboards/hosts-overview.json
new file mode 100644 (file)
index 0000000..718aaa3
--- /dev/null
@@ -0,0 +1,837 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "singlestat",
+      "name": "Singlestat",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1538079414024,
+  "links": [],
+  "panels": [
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "format": "none",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 0,
+        "y": 0
+      },
+      "id": 5,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "count(sum by (instance) (ceph_disk_occupation))",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": "",
+      "title": "OSD Hosts",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "decimals": 0,
+      "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster",
+      "format": "percentunit",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 4,
+        "y": 0
+      },
+      "id": 6,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "avg(\n  1 - (\n    avg by(instance) \n      (irate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n       irate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n    )\n  )",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": "",
+      "title": "AVG CPU Busy",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "decimals": 0,
+      "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)",
+      "format": "percentunit",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 8,
+        "y": 0
+      },
+      "id": 9,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "avg (((node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})- (\n  (node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})  + \n  (node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) + \n  (node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n  (node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})\n  )) /\n (node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemTotal_bytes{instance=~\"[[osd_hosts]]|[[rgw_hosts]]|[[mon_hosts]]|[[mds_hosts]].*\"} ))",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": "",
+      "title": "AVG RAM Utilization",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "description": "IOPS Load at the device as reported by the OS on all OSD hosts",
+      "format": "none",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 12,
+        "y": 0
+      },
+      "id": 2,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "sum ((irate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[5m]) )  + \n(irate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[5m]) or irate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[5m])))",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": "",
+      "title": "Physical IOPS",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "description": "Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)",
+      "format": "percent",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 16,
+        "y": 0
+      },
+      "id": 20,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "avg (\n  ((irate(node_disk_io_time_ms[5m]) / 10 ) or\n   (irate(node_disk_io_time_seconds_total[5m]) * 100)\n  ) *\n  on(instance, device) ceph_disk_occupation{instance=~\"($osd_hosts).*\"}\n)",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": "",
+      "title": "AVG Disk Utilization",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "cacheTimeout": null,
+      "colorBackground": false,
+      "colorValue": false,
+      "colors": [
+        "#299c46",
+        "rgba(237, 129, 40, 0.89)",
+        "#d44a3a"
+      ],
+      "datasource": "$datasource",
+      "decimals": 0,
+      "description": "Total send/receive network load across all hosts in the ceph cluster",
+      "format": "bytes",
+      "gauge": {
+        "maxValue": 100,
+        "minValue": 0,
+        "show": false,
+        "thresholdLabels": false,
+        "thresholdMarkers": true
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 20,
+        "y": 0
+      },
+      "id": 18,
+      "interval": null,
+      "links": [],
+      "mappingType": 1,
+      "mappingTypes": [
+        {
+          "name": "value to text",
+          "value": 1
+        },
+        {
+          "name": "range to text",
+          "value": 2
+        }
+      ],
+      "maxDataPoints": 100,
+      "nullPointMode": "connected",
+      "nullText": null,
+      "postfix": "",
+      "postfixFontSize": "50%",
+      "prefix": "",
+      "prefixFontSize": "50%",
+      "rangeMaps": [
+        {
+          "from": "null",
+          "text": "N/A",
+          "to": "null"
+        }
+      ],
+      "sparkline": {
+        "fillColor": "rgba(31, 118, 189, 0.18)",
+        "full": false,
+        "lineColor": "rgb(31, 120, 193)",
+        "show": false
+      },
+      "tableColumn": "",
+      "targets": [
+        {
+          "expr": "sum (\n  irate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n  irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n  ) +\nsum (\n  irate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m]) or\n  irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[1m])\n  )",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A"
+        }
+      ],
+      "thresholds": "",
+      "title": "Network Load",
+      "type": "singlestat",
+      "valueFontSize": "80%",
+      "valueMaps": [
+        {
+          "op": "=",
+          "text": "N/A",
+          "value": "null"
+        }
+      ],
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Show the top 10 busiest hosts by CPU",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 5
+      },
+      "id": 13,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "topk(10,( 1 - (\n    avg by(instance) \n      (irate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n       irate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n    )\n  )\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "CPU Busy - Top 10 Hosts",
+      "tooltip": {
+        "shared": true,
+        "sort": 1,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 1,
+          "format": "percentunit",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": false
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Top 10 hosts by network load",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 5
+      },
+      "id": 19,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "topk(10, (sum by(instance) (\n  (\n  irate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n  irate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n  ) +\n  (\n  irate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n  irate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n  ))\n  )\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Network Load - Top 10",
+      "tooltip": {
+        "shared": true,
+        "sort": 1,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 1,
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": "",
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "osd_hosts",
+        "options": [],
+        "query": "label_values(ceph_disk_occupation, instance)",
+        "refresh": 1,
+        "regex": "([^.]*).*",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "ceph",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "mon_hosts",
+        "options": [],
+        "query": "label_values(ceph_mon_metadata, ceph_daemon)",
+        "refresh": 1,
+        "regex": "mon.(.*)",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "mds_hosts",
+        "options": [],
+        "query": "label_values(ceph_mds_inodes, ceph_daemon)",
+        "refresh": 1,
+        "regex": "mds.(.*)",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "rgw_hosts",
+        "options": [],
+        "query": "label_values(ceph_rgw_qlen, ceph_daemon)",
+        "refresh": 1,
+        "regex": "rgw.(.*)",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "Host Overview",
+  "uid": "lxnjcTAmk",
+  "version": 10
+}
diff --git a/dashboards/osd-device-details.json b/dashboards/osd-device-details.json
new file mode 100644 (file)
index 0000000..8820925
--- /dev/null
@@ -0,0 +1,740 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1534385833420,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 14,
+      "panels": [],
+      "title": "OSD Performance",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 0,
+        "y": 1
+      },
+      "id": 2,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "irate(ceph_osd_op_r_latency_sum{ceph_daemon=~\"osd.[[osd_id]]\"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "READs",
+          "refId": "A"
+        },
+        {
+          "expr": "irate(ceph_osd_op_w_latency_sum{ceph_daemon=~\"osd.[[osd_id]]\"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "WRITEs",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "OSD $osd_id Latency",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 6,
+        "y": 1
+      },
+      "id": 8,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "irate(ceph_osd_op_r{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Reads",
+          "refId": "A"
+        },
+        {
+          "expr": "irate(ceph_osd_op_w{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Writes",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "OSD $osd_id R/W IOPS",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 12,
+        "y": 1
+      },
+      "id": 7,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "irate(ceph_osd_op_r_out_bytes{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Read Bytes",
+          "refId": "A"
+        },
+        {
+          "expr": "irate(ceph_osd_op_w_in_bytes{ceph_daemon=~\"osd.[[osd_id]]\"}[1m])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Write Bytes",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "OSD $osd_id R/W Bytes",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 10
+      },
+      "id": 12,
+      "panels": [],
+      "title": "Physical Device Performance",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 0,
+        "y": 11
+      },
+      "id": 9,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null as zero",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(irate(node_disk_read_time_ms[1m]) / irate(node_disk_reads_completed[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}) ",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}}/{{device}} Reads",
+          "refId": "A"
+        },
+        {
+          "expr": "(irate(node_disk_write_time_ms[1m]) / irate(node_disk_writes_completed[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}) ",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}}/{{device}} Writes",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Physical Device Latency for OSD $osd_id",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 6,
+        "y": 11
+      },
+      "id": 5,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(irate(node_disk_reads_completed[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}} {{device}} READS",
+          "refId": "A"
+        },
+        {
+          "expr": "(irate(node_disk_writes_completed[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}) ",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}} {{device}} WRITES",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Physical Device R/W IOPS for OSD $osd_id",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 12,
+        "y": 11
+      },
+      "id": 10,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(irate(node_disk_bytes_read[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}} {{device}} READS",
+          "refId": "A"
+        },
+        {
+          "expr": "(irate(node_disk_bytes_written[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}) ",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}} {{device}} WRITES",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Physical Device R/W Bytes for OSD $osd_id",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 18,
+        "y": 11
+      },
+      "id": 4,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(irate(node_disk_io_time_ms[1m]) and on (instance, device) ceph_disk_occupation{ceph_daemon=~\"osd.[[osd_id]]\"}) / 10",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{instance}} {{device}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Physical Device Util% for OSD $osd_id",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    }
+  ],
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 0,
+        "includeAll": false,
+        "label": "OSD Id",
+        "multi": false,
+        "name": "osd_id",
+        "options": [],
+        "query": "label_values(ceph_osd_metadata,ceph_daemon)",
+        "refresh": 1,
+        "regex": "osd.(.*)",
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "OSD device details",
+  "uid": "MKj_9ipiz",
+  "version": 3
+}
diff --git a/dashboards/osds-overview.json b/dashboards/osds-overview.json
new file mode 100644 (file)
index 0000000..3fc6d31
--- /dev/null
@@ -0,0 +1,876 @@
+{
+
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "grafana-piechart-panel",
+      "name": "Pie Chart",
+      "version": "1.3.3"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "table",
+      "name": "Table",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1538083987689,
+  "links": [],
+  "panels": [
+    {
+      "aliasColors": {
+        "@95%ile": "#e0752d"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 0,
+        "y": 0
+      },
+      "id": 12,
+      "legend": {
+        "avg": false,
+        "current": true,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "AVG read",
+          "refId": "A"
+        },
+        {
+          "expr": "max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "MAX read",
+          "refId": "B"
+        },
+        {
+          "expr": "quantile(0.95,\n  (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "@95%ile",
+          "refId": "C"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "OSD Read Latencies",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "columns": [],
+      "datasource": "$datasource",
+      "description": "This table shows the OSDs that are delivering the 10 highest read latencies within the cluster",
+      "fontSize": "100%",
+      "gridPos": {
+        "h": 8,
+        "w": 4,
+        "x": 8,
+        "y": 0
+      },
+      "id": 15,
+      "links": [],
+      "pageSize": null,
+      "scroll": true,
+      "showHeader": true,
+      "sort": {
+        "col": 2,
+        "desc": true
+      },
+      "styles": [
+        {
+          "alias": "OSD ID",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "ceph_daemon",
+          "thresholds": [],
+          "type": "string",
+          "unit": "short"
+        },
+        {
+          "alias": "Latency (ms)",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 0,
+          "pattern": "Value",
+          "thresholds": [],
+          "type": "number",
+          "unit": "none"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "/.*/",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        }
+      ],
+      "targets": [
+        {
+          "expr": "topk(10,\n  (sort(\n    (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n  ))\n)\n\n",
+          "format": "table",
+          "instant": true,
+          "intervalFactor": 1,
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "title": "Highest READ Latencies",
+      "transform": "table",
+      "type": "table"
+    },
+    {
+      "aliasColors": {
+        "@95%ile write": "#e0752d"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 12,
+        "y": 0
+      },
+      "id": 13,
+      "legend": {
+        "avg": false,
+        "current": true,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "AVG write",
+          "refId": "A"
+        },
+        {
+          "expr": "max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "MAX write",
+          "refId": "B"
+        },
+        {
+          "expr": "quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "@95%ile write",
+          "refId": "C"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "OSD Write Latencies",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "columns": [],
+      "datasource": "$datasource",
+      "description": "This table shows the OSDs that are delivering the 10 highest write latencies within the cluster",
+      "fontSize": "100%",
+      "gridPos": {
+        "h": 8,
+        "w": 4,
+        "x": 20,
+        "y": 0
+      },
+      "id": 16,
+      "links": [],
+      "pageSize": null,
+      "scroll": true,
+      "showHeader": true,
+      "sort": {
+        "col": 2,
+        "desc": true
+      },
+      "styles": [
+        {
+          "alias": "OSD ID",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "ceph_daemon",
+          "thresholds": [],
+          "type": "string",
+          "unit": "short"
+        },
+        {
+          "alias": "Latency (ms)",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 0,
+          "pattern": "Value",
+          "thresholds": [],
+          "type": "number",
+          "unit": "none"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "/.*/",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        }
+      ],
+      "targets": [
+        {
+          "expr": "topk(10,\n  (sort(\n    (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n  ))\n)\n\n",
+          "format": "table",
+          "instant": true,
+          "intervalFactor": 1,
+          "legendFormat": "",
+          "refId": "A"
+        }
+      ],
+      "title": "Highest WRITE Latencies",
+      "transform": "table",
+      "type": "table"
+    },
+    {
+      "aliasColors": {},
+      "breakPoint": "50%",
+      "cacheTimeout": null,
+      "combine": {
+        "label": "Others",
+        "threshold": 0
+      },
+      "datasource": "$datasource",
+      "fontSize": "80%",
+      "format": "none",
+      "gridPos": {
+        "h": 8,
+        "w": 4,
+        "x": 0,
+        "y": 8
+      },
+      "id": 2,
+      "interval": null,
+      "legend": {
+        "show": true,
+        "values": true
+      },
+      "legendType": "Under graph",
+      "links": [],
+      "maxDataPoints": 3,
+      "nullPointMode": "connected",
+      "pieType": "pie",
+      "strokeWidth": 1,
+      "targets": [
+        {
+          "expr": "count by(device_class) (ceph_osd_metadata)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{device_class}}",
+          "refId": "A"
+        }
+      ],
+      "title": "OSD Types Summary",
+      "type": "grafana-piechart-panel",
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {
+        "Non-Encrypted": "#E5AC0E"
+      },
+      "breakPoint": "50%",
+      "cacheTimeout": null,
+      "combine": {
+        "label": "Others",
+        "threshold": 0
+      },
+      "datasource": "$datasource",
+      "fontSize": "80%",
+      "format": "none",
+      "gridPos": {
+        "h": 8,
+        "w": 4,
+        "x": 4,
+        "y": 8
+      },
+      "height": "200px",
+      "hideTimeOverride": true,
+      "id": 4,
+      "interval": null,
+      "legend": {
+        "percentage": false,
+        "show": true,
+        "values": true
+      },
+      "legendType": "Under graph",
+      "links": [],
+      "maxDataPoints": "1",
+      "minSpan": 4,
+      "nullPointMode": "connected",
+      "pieType": "pie",
+      "strokeWidth": 1,
+      "targets": [
+        {
+          "expr": "count(ceph_bluefs_wal_total_bytes)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "bluestore",
+          "refId": "A",
+          "step": 240
+        },
+        {
+          "expr": "count(ceph_osd_metadata) - count(ceph_bluefs_wal_total_bytes)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "filestore",
+          "refId": "B",
+          "step": 240
+        },
+        {
+          "expr": "absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "filestore",
+          "refId": "C",
+          "step": 240
+        }
+      ],
+      "timeFrom": "2m",
+      "timeShift": null,
+      "title": "OSD Objectstore Types",
+      "type": "grafana-piechart-panel",
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {},
+      "breakPoint": "50%",
+      "cacheTimeout": null,
+      "combine": {
+        "label": "Others",
+        "threshold": "0.05"
+      },
+      "datasource": "$datasource",
+      "description": "The pie chart shows the various OSD sizes used within the cluster",
+      "fontSize": "80%",
+      "format": "none",
+      "gridPos": {
+        "h": 8,
+        "w": 4,
+        "x": 8,
+        "y": 8
+      },
+      "height": "220",
+      "hideTimeOverride": true,
+      "id": 8,
+      "interval": null,
+      "legend": {
+        "header": "",
+        "percentage": false,
+        "show": true,
+        "sideWidth": null,
+        "sortDesc": true,
+        "values": true
+      },
+      "legendType": "Under graph",
+      "links": [],
+      "maxDataPoints": "",
+      "minSpan": 6,
+      "nullPointMode": "connected",
+      "pieType": "pie",
+      "strokeWidth": "1",
+      "targets": [
+        {
+          "expr": "count(ceph_osd_stat_bytes < 1099511627776)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<1 TB",
+          "refId": "A",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<2 TB",
+          "refId": "B",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<3TB",
+          "refId": "C",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<4TB",
+          "refId": "D",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<6TB",
+          "refId": "E",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<8TB",
+          "refId": "F",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<10TB",
+          "refId": "G",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "<12TB",
+          "refId": "H",
+          "step": 2
+        },
+        {
+          "expr": "count(ceph_osd_stat_bytes >= 13194139533312)",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "12TB+",
+          "refId": "I",
+          "step": 2
+        }
+      ],
+      "timeFrom": "2m",
+      "timeShift": null,
+      "title": "OSD Size Summary",
+      "type": "grafana-piechart-panel",
+      "valueName": "current"
+    },
+    {
+      "aliasColors": {},
+      "bars": true,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Each bar indicates the number of OSDs that have a PG count in a specific range as shown on the x-axis.",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "id": 6,
+      "legend": {
+        "alignAsTable": false,
+        "avg": false,
+        "current": false,
+        "hideEmpty": false,
+        "hideZero": false,
+        "max": false,
+        "min": false,
+        "rightSide": false,
+        "show": false,
+        "total": false,
+        "values": false
+      },
+      "lines": false,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "ceph_osd_numpg\n",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "legendFormat": "PGs per OSD",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Distribution of PGs per OSD",
+      "tooltip": {
+        "shared": false,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": 20,
+        "mode": "histogram",
+        "name": null,
+        "show": true,
+        "values": [
+          "total"
+        ]
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "short",
+          "label": "# of OSDs",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 16
+      },
+      "id": 20,
+      "panels": [],
+      "title": "R/W Profile",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Shows the read/write workload profile over time",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 17
+      },
+      "id": 10,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "round(sum(irate(ceph_pool_rd[30s])))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Reads",
+          "refId": "A"
+        },
+        {
+          "expr": "round(sum(irate(ceph_pool_wr[30s])))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Writes",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": "36h",
+      "timeShift": null,
+      "title": "Read/Write Profile",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+        {
+          "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+          },
+          "hide": 0,
+          "label": "Data Source",
+          "name": "datasource",
+          "options": [],
+          "query": "prometheus",
+          "refresh": 1,
+          "regex": "",
+          "type": "datasource"
+        }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "OSD Overview",
+  "uid": "lo02I1Aiz",
+  "version": 3
+}
diff --git a/dashboards/pool-detail.json b/dashboards/pool-detail.json
new file mode 100644 (file)
index 0000000..64f34ac
--- /dev/null
@@ -0,0 +1,336 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "text",
+      "name": "Text",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1534394258671,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 16,
+      "panels": [],
+      "repeat": null,
+      "title": "Pool '$pool_name' Performance Details",
+      "type": "row"
+    },
+    {
+      "aliasColors": {
+        "read_op_per_sec": "#3F6833",
+        "write_op_per_sec": "#E5AC0E"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 6,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "minSpan": 12,
+      "nullPointMode": "null as zero",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(label_replace(irate(ceph_pool_rd[1m]),\"id\",\"$1\",\"pool_id\",\"(.*)\") * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"[[pool_name]]\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "reads",
+          "refId": "B"
+        },
+        {
+          "expr": "(label_replace(irate(ceph_pool_wr[1m]),\"id\",\"$1\",\"pool_id\",\"(.*)\") * on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"[[pool_name]]\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "writes",
+          "refId": "C"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Pool '$pool_name' Client IOPS",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "none",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {
+        "read_op_per_sec": "#3F6833",
+        "write_op_per_sec": "#E5AC0E"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 1
+      },
+      "id": 7,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "minSpan": 12,
+      "nullPointMode": "null as zero",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(label_replace(irate(ceph_pool_rd_bytes[1m]),\"id\",\"$1\",\"pool_id\",\"(.*)\") + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"[[pool_name]]\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "reads",
+          "refId": "A"
+        },
+        {
+          "expr": "(label_replace(irate(ceph_pool_wr_bytes[1m]),\"id\",\"$1\",\"pool_id\",\"(.*)\") + on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"[[pool_name]]\"})",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "writes",
+          "refId": "C"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Pool '$pool_name' Client Throughput",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "decbytes",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    }
+  ],
+  "refresh": "15s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "pool_id",
+        "options": [],
+        "query": "label_values(ceph_pool_metadata,pool_id)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "current": {
+        "tags": [],
+        "text": "default",
+        "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 0,
+        "includeAll": false,
+        "label": "Pool Name",
+        "multi": false,
+        "name": "pool_name",
+        "options": [],
+        "query": "label_values(ceph_pool_metadata,name)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "15s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "Ceph Pool Detail",
+  "uid": "8ypfkWpik",
+  "version": 11
+}
diff --git a/dashboards/pool-overview.json b/dashboards/pool-overview.json
new file mode 100644 (file)
index 0000000..505108d
--- /dev/null
@@ -0,0 +1,745 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "table",
+      "name": "Table",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1534386772937,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 14,
+      "panels": [],
+      "repeat": null,
+      "title": "Pool Overview",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 5,
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 1,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "minSpan": 12,
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(label_replace((rate(ceph_pool_rd{pool_id=~\"[[pool_id]]\"}[1m]) + rate(ceph_pool_wr{pool_id=~\"[[pool_id]]\"}[1m])),\"id\", \"$1\", \"pool_id\", \"(.*)\") + on(pool_id) group_left(instance,name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) ",
+          "format": "time_series",
+          "hide": false,
+          "intervalFactor": 1,
+          "legendFormat": "{{name}}",
+          "refId": "F"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Client IOPS by Pool",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "none",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 5,
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 1
+      },
+      "id": 2,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "minSpan": 12,
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "(label_replace((rate(ceph_pool_rd_bytes{pool_id=~\"[[pool_id]]\"}[1m]) + rate(ceph_pool_wr_bytes{pool_id=~\"[[pool_id]]\"}[1m])),\"id\", \"$1\", \"pool_id\", \"(.*)\") + on(pool_id) group_left(instance,name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) ",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{name}}",
+          "refId": "A",
+          "textEditor": true
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Client Throughput by Pool",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "decbytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 8
+      },
+      "id": 15,
+      "panels": [],
+      "repeat": null,
+      "title": "Top 5's",
+      "type": "row"
+    },
+    {
+      "columns": [
+        {
+          "text": "Current",
+          "value": "current"
+        }
+      ],
+      "datasource": "$datasource",
+      "fontSize": "100%",
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 0,
+        "y": 9
+      },
+      "id": 3,
+      "links": [],
+      "minSpan": 12,
+      "pageSize": null,
+      "scroll": true,
+      "showHeader": true,
+      "sort": {
+        "col": 6,
+        "desc": true
+      },
+      "styles": [
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "Time",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "id",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "instance",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "job",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "Pool Name",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "name",
+          "thresholds": [],
+          "type": "number",
+          "unit": "short"
+        },
+        {
+          "alias": "Pool ID",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "pool_id",
+          "thresholds": [],
+          "type": "number",
+          "unit": "short"
+        },
+        {
+          "alias": "IOPS (R+W)",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 0,
+          "pattern": "Value",
+          "thresholds": [],
+          "type": "number",
+          "unit": "none"
+        }
+      ],
+      "targets": [
+        {
+          "expr": "topk(5,(label_replace((irate(ceph_pool_rd{pool_id=~\"[[pool_id]]\"}[1m]) + irate(ceph_pool_wr{pool_id=~\"[[pool_id]]\"}[1m])),\"id\", \"$1\", \"pool_id\", \"(.*)\") + on(pool_id) group_left(instance,name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) )",
+          "format": "table",
+          "instant": true,
+          "intervalFactor": 2,
+          "refId": "A",
+          "textEditor": true
+        }
+      ],
+      "title": "Top 5 Pools by Client IOPS",
+      "transform": "table",
+      "type": "table"
+    },
+    {
+      "columns": [
+        {
+          "text": "Current",
+          "value": "current"
+        }
+      ],
+      "datasource": "$datasource",
+      "fontSize": "100%",
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 8,
+        "y": 9
+      },
+      "id": 4,
+      "links": [],
+      "minSpan": 12,
+      "pageSize": null,
+      "scroll": true,
+      "showHeader": true,
+      "sort": {
+        "col": 6,
+        "desc": true
+      },
+      "styles": [
+        {
+          "alias": "Time",
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "pattern": "Time",
+          "type": "hidden"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "id",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "instance",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "job",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "Pool Name",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "name",
+          "thresholds": [],
+          "type": "string",
+          "unit": "short"
+        },
+        {
+          "alias": "Pool ID",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "pool_id",
+          "thresholds": [],
+          "type": "number",
+          "unit": "short"
+        },
+        {
+          "alias": "Throughput",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "Value",
+          "thresholds": [],
+          "type": "number",
+          "unit": "decbytes"
+        }
+      ],
+      "targets": [
+        {
+          "expr": "topk(5,(label_replace((irate(ceph_pool_rd_bytes{pool_id=~\"[[pool_id]]\"}[1m]) + irate(ceph_pool_wr_bytes{pool_id=~\"[[pool_id]]\"}[1m])),\"id\", \"$1\", \"pool_id\", \"(.*)\") + on(pool_id) group_left(instance,name) ceph_pool_metadata{pool_id=~\"[[pool_id]]\"}) )",
+          "format": "table",
+          "instant": true,
+          "intervalFactor": 2,
+          "refId": "A",
+          "textEditor": true
+        }
+      ],
+      "title": "Top 5 Pools by Throughput",
+      "transform": "table",
+      "type": "table"
+    },
+    {
+      "columns": [],
+      "datasource": "$datasource",
+      "fontSize": "100%",
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 16,
+        "y": 9
+      },
+      "id": 5,
+      "links": [],
+      "minSpan": 8,
+      "pageSize": null,
+      "scroll": true,
+      "showHeader": true,
+      "sort": {
+        "col": 5,
+        "desc": true
+      },
+      "styles": [
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "Time",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "instance",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "job",
+          "thresholds": [],
+          "type": "hidden",
+          "unit": "short"
+        },
+        {
+          "alias": "Pool Name",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "name",
+          "thresholds": [],
+          "type": "string",
+          "unit": "short"
+        },
+        {
+          "alias": "Pool ID",
+          "colorMode": null,
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "pool_id",
+          "thresholds": [],
+          "type": "number",
+          "unit": "short"
+        },
+        {
+          "alias": "Capacity Used",
+          "colorMode": "value",
+          "colors": [
+            "rgba(50, 172, 45, 0.97)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(245, 54, 54, 0.9)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 2,
+          "pattern": "Value",
+          "thresholds": [
+            "70",
+            "85"
+          ],
+          "type": "number",
+          "unit": "percentunit"
+        }
+      ],
+      "targets": [
+        {
+          "expr": "topk(5,((ceph_pool_bytes_used / (ceph_pool_bytes_used + ceph_pool_max_avail)) * on(pool_id) group_left(name) ceph_pool_metadata))",
+          "format": "table",
+          "hide": false,
+          "instant": true,
+          "intervalFactor": 1,
+          "legendFormat": "",
+          "refId": "D"
+        }
+      ],
+      "title": "Top 5 Pools By Capacity Used",
+      "transform": "table",
+      "type": "table"
+    }
+  ],
+  "refresh": "15s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "pool_id",
+        "options": [],
+        "query": "label_values(ceph_pool_metadata,pool_id)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": "Pool Name",
+        "multi": false,
+        "name": "pool_name",
+        "options": [],
+        "query": "label_values(ceph_pool_metadata,name)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "15s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "Ceph Pools Overview",
+  "uid": "z99hzWtmk",
+  "version": 3
+}
diff --git a/dashboards/radosgw-detail.json b/dashboards/radosgw-detail.json
new file mode 100644 (file)
index 0000000..8aa0241
--- /dev/null
@@ -0,0 +1,486 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "grafana-piechart-panel",
+      "name": "Pie Chart",
+      "version": "1.3.3"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1534386250869,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 12,
+      "panels": [],
+      "repeat": null,
+      "title": "RGW Host Detail : $rgw_servers",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 0,
+        "y": 1
+      },
+      "id": 34,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "rate(ceph_rgw_get_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_get_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "GET",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(ceph_rgw_put_initial_lat_sum{ceph_daemon=~\"($rgw_servers)\"}[30s]) / rate(ceph_rgw_put_initial_lat_count{ceph_daemon=~\"($rgw_servers)\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "PUT",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "$rgw_servers GET/PUT Latencies",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 7,
+        "x": 6,
+        "y": 1
+      },
+      "id": 18,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "rate(ceph_rgw_get_b{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "GETs",
+          "refId": "B"
+        },
+        {
+          "expr": "rate(ceph_rgw_put_b{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "PUTs",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Bandwidth by HTTP Operation",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "bytes",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {
+        "GETs": "#7eb26d",
+        "Other": "#447ebc",
+        "PUTs": "#eab839",
+        "Requests": "#3f2b5b",
+        "Requests Failed": "#bf1b00"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 8,
+        "w": 7,
+        "x": 13,
+        "y": 1
+      },
+      "id": 14,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Requests Failed",
+          "refId": "B"
+        },
+        {
+          "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "GETs",
+          "refId": "C"
+        },
+        {
+          "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "PUTs",
+          "refId": "D"
+        },
+        {
+          "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) -\n  (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) +\n   rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Other",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "HTTP Request Breakdown",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {
+        "Failures": "#bf1b00",
+        "GETs": "#7eb26d",
+        "Other (HEAD,POST,DELETE)": "#447ebc",
+        "PUTs": "#eab839"
+      },
+      "breakPoint": "50%",
+      "cacheTimeout": null,
+      "combine": {
+        "label": "Others",
+        "threshold": 0
+      },
+      "datasource": "$datasource",
+      "fontSize": "80%",
+      "format": "none",
+      "gridPos": {
+        "h": 8,
+        "w": 4,
+        "x": 20,
+        "y": 1
+      },
+      "id": 23,
+      "interval": null,
+      "legend": {
+        "show": true,
+        "values": true
+      },
+      "legendType": "Under graph",
+      "links": [],
+      "maxDataPoints": 3,
+      "nullPointMode": "connected",
+      "pieType": "pie",
+      "strokeWidth": 1,
+      "targets": [
+        {
+          "expr": "rate(ceph_rgw_failed_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Failures",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "GETs",
+          "refId": "B"
+        },
+        {
+          "expr": "rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s])",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "PUTs",
+          "refId": "C"
+        },
+        {
+          "expr": "rate(ceph_rgw_req{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) -\n  (rate(ceph_rgw_get{ceph_daemon=~\"[[rgw_servers]]\"}[30s]) +\n   rate(ceph_rgw_put{ceph_daemon=~\"[[rgw_servers]]\"}[30s]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "Other (HEAD,POST,DELETE)",
+          "refId": "D"
+        }
+      ],
+      "title": "Workload Breakdown",
+      "type": "grafana-piechart-panel",
+      "valueName": "current"
+    }
+  ],
+  "refresh": "15s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [
+    "overview"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {
+        "tags": [],
+        "text": "default",
+        "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 0,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "rgw_servers",
+        "options": [],
+        "query": "label_values(ceph_rgw_req, ceph_daemon)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "15s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "RGW Instance Detail",
+  "uid": "x5ARzZtmk",
+  "version": 2
+}
diff --git a/dashboards/radosgw-overview.json b/dashboards/radosgw-overview.json
new file mode 100644 (file)
index 0000000..f950bc4
--- /dev/null
@@ -0,0 +1,630 @@
+{
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "5.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": "5.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1534386107523,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 2,
+      "panels": [],
+      "title": "RGW Overview - All Gateways",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 0,
+        "y": 1
+      },
+      "id": 29,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "avg(rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "GET AVG",
+          "refId": "A"
+        },
+        {
+          "expr": "avg(rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "PUT AVG",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Average GET/PUT Latencies",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "fill": 1,
+      "gridPos": {
+        "h": 7,
+        "w": 7,
+        "x": 8,
+        "y": 1
+      },
+      "id": 4,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum by(rgw_host) (label_replace(rate(ceph_rgw_req[30s]), \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{rgw_host}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Total Requests/sec by RGW Instance",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "none",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts",
+      "fill": 1,
+      "gridPos": {
+        "h": 7,
+        "w": 6,
+        "x": 15,
+        "y": 1
+      },
+      "id": 31,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "label_replace(rate(ceph_rgw_get_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_get_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{rgw_host}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "GET Latencies by RGW Instance",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": null,
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": false
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Total bytes transferred in/out of all radosgw instances within the cluster",
+      "fill": 1,
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 0,
+        "y": 8
+      },
+      "id": 6,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(ceph_rgw_get_b[30s]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "GETs",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(rate(ceph_rgw_put_b[30s]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "PUTs",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Bandwidth Consumed by Type",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Total bytes transferred in/out through get/put operations, by radosgw instance",
+      "fill": 1,
+      "gridPos": {
+        "h": 6,
+        "w": 7,
+        "x": 8,
+        "y": 8
+      },
+      "id": 9,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum by(rgw_host) (\n  (label_replace(rate(ceph_rgw_get_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")) + \n  (label_replace(rate(ceph_rgw_put_b[30s]), \"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\"))\n)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{rgw_host}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Bandwidth by RGW Instance",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ]
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "$datasource",
+      "description": "Latencies are shown stacked, without a y-axis, to provide a visual indication of PUT latency imbalance across RGW hosts",
+      "fill": 1,
+      "gridPos": {
+        "h": 6,
+        "w": 6,
+        "x": 15,
+        "y": 8
+      },
+      "id": 32,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "links": [],
+      "nullPointMode": "null",
+      "percentage": false,
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "label_replace(rate(ceph_rgw_put_initial_lat_sum[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\") / \nlabel_replace(rate(ceph_rgw_put_initial_lat_count[30s]),\"rgw_host\",\"$1\",\"ceph_daemon\",\"rgw.(.*)\")",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{rgw_host}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "PUT Latencies by RGW Instance",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": null,
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": false
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ]
+    }
+  ],
+  "refresh": "15s",
+  "schemaVersion": 16,
+  "style": "dark",
+  "tags": [
+    "overview"
+  ],
+  "templating": {
+    "list": [
+      {
+        "allValue": null,
+        "current": {},
+        "datasource": "$datasource",
+        "hide": 2,
+        "includeAll": true,
+        "label": null,
+        "multi": false,
+        "name": "rgw_servers",
+        "options": [],
+        "query": "label_values(ceph_rgw_req, ceph_daemon)",
+        "refresh": 1,
+        "regex": "",
+        "sort": 1,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "current": {
+          "tags": [],
+          "text": "default",
+          "value": "default"
+        },
+        "hide": 0,
+        "label": "Data Source",
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "15s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "RGW Overview",
+  "uid": "WAkugZpiz",
+  "version": 2
+}
diff --git a/patches/0001-ansible-Disable-devel_mode.patch b/patches/0001-ansible-Disable-devel_mode.patch
new file mode 100644 (file)
index 0000000..96ae643
--- /dev/null
@@ -0,0 +1,27 @@
+From 49ffd15645a8b377b600f44102cad613a71fdd2b Mon Sep 17 00:00:00 2001
+From: Boris Ranto <branto@redhat.com>
+Date: Fri, 6 Oct 2017 12:22:37 +0200
+Subject: [PATCH] ansible: Disable devel_mode
+
+Signed-off-by: Boris Ranto <branto@redhat.com>
+---
+ ansible/roles/ceph-defaults/tasks/merge_vars.yml | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/ansible/roles/ceph-defaults/tasks/merge_vars.yml b/ansible/roles/ceph-defaults/tasks/merge_vars.yml
+index f8dbcd0..15d2a6b 100644
+--- a/ansible/roles/ceph-defaults/tasks/merge_vars.yml
++++ b/ansible/roles/ceph-defaults/tasks/merge_vars.yml
+@@ -3,3 +3,9 @@
+   set_fact: {"{{ item }}": "{% if vars[item] is not defined %}{{ defaults[item] }}{% elif vars[item] is mapping %}{{ defaults[item]|combine(vars[item]|default({})) }}{% else %}{{ vars[item] }}{% endif %}"}
+   with_items: "{{ defaults.keys() }}"
+   no_log: true
++
++- name: Make sure devel_mode is not on
++  assert:
++    that:
++      - devel_mode == False
++    msg: "Devel mode is not supported in the downstream builds"
+-- 
+2.9.5
+
diff --git a/screenshots/archive/dashboard-2017-05-19.png b/screenshots/archive/dashboard-2017-05-19.png
new file mode 100644 (file)
index 0000000..fde57c5
Binary files /dev/null and b/screenshots/archive/dashboard-2017-05-19.png differ
diff --git a/screenshots/archive/dashboard-2017-05-24.png b/screenshots/archive/dashboard-2017-05-24.png
new file mode 100644 (file)
index 0000000..3701904
Binary files /dev/null and b/screenshots/archive/dashboard-2017-05-24.png differ
diff --git a/screenshots/archive/dashboard-2017-05-26.png b/screenshots/archive/dashboard-2017-05-26.png
new file mode 100644 (file)
index 0000000..611f9d6
Binary files /dev/null and b/screenshots/archive/dashboard-2017-05-26.png differ
diff --git a/screenshots/archive/dashboard-2017-05-29.png b/screenshots/archive/dashboard-2017-05-29.png
new file mode 100644 (file)
index 0000000..6089f79
Binary files /dev/null and b/screenshots/archive/dashboard-2017-05-29.png differ
diff --git a/screenshots/at-a-glance.png b/screenshots/at-a-glance.png
new file mode 100644 (file)
index 0000000..466a879
Binary files /dev/null and b/screenshots/at-a-glance.png differ
diff --git a/screenshots/ceph-backend.png b/screenshots/ceph-backend.png
new file mode 100644 (file)
index 0000000..8da34f0
Binary files /dev/null and b/screenshots/ceph-backend.png differ
diff --git a/screenshots/ceph-frontend.png b/screenshots/ceph-frontend.png
new file mode 100644 (file)
index 0000000..cb3dee4
Binary files /dev/null and b/screenshots/ceph-frontend.png differ
diff --git a/screenshots/ceph-rados.png b/screenshots/ceph-rados.png
new file mode 100644 (file)
index 0000000..630d4ad
Binary files /dev/null and b/screenshots/ceph-rados.png differ
diff --git a/screenshots/ceph-rgw.png b/screenshots/ceph-rgw.png
new file mode 100644 (file)
index 0000000..ad271c9
Binary files /dev/null and b/screenshots/ceph-rgw.png differ
diff --git a/screenshots/disk-busy-by-server.png b/screenshots/disk-busy-by-server.png
new file mode 100644 (file)
index 0000000..2b5d4a2
Binary files /dev/null and b/screenshots/disk-busy-by-server.png differ
diff --git a/screenshots/disk-latency-by-server.png b/screenshots/disk-latency-by-server.png
new file mode 100644 (file)
index 0000000..8bfa17a
Binary files /dev/null and b/screenshots/disk-latency-by-server.png differ
diff --git a/screenshots/iops-by-server.png b/screenshots/iops-by-server.png
new file mode 100644 (file)
index 0000000..189ca46
Binary files /dev/null and b/screenshots/iops-by-server.png differ
diff --git a/screenshots/network-load.png b/screenshots/network-load.png
new file mode 100644 (file)
index 0000000..6f49966
Binary files /dev/null and b/screenshots/network-load.png differ
diff --git a/screenshots/osd-node-details.png b/screenshots/osd-node-details.png
new file mode 100644 (file)
index 0000000..21be086
Binary files /dev/null and b/screenshots/osd-node-details.png differ
diff --git a/tests/testosd.py b/tests/testosd.py
new file mode 100644 (file)
index 0000000..53dc139
--- /dev/null
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+from collectors.osd import OSDs
+from collectors.common import flatten_dict
+
+import time
+
+def main():
+    o = OSDs('ceph')
+    ctr = 0
+    while ctr < 30:
+
+        s = o.get_stats()
+        print(s)
+        print(flatten_dict(s))
+
+        time.sleep(1)
+        ctr += 1
+
+if __name__ == "__main__":  # run the polling loop only when executed as a script
+    main()
diff --git a/tox.ini b/tox.ini
new file mode 100644 (file)
index 0000000..26a4a0f
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,47 @@
+[tox]
+skipsdist = True
+envlist=ansible-lint,ansible-syntax,flake8,dashboards
+
+[testenv:ansible-lint]
+install_command = pip install --upgrade {opts} {packages}
+deps=
+  ansible-lint
+commands=ansible-lint -x ANSIBLE0010,ANSIBLE0012,ANSIBLE0017 ansible/playbook.yml
+
+[testenv:ansible-syntax]
+install_command = pip install --upgrade {opts} {packages}
+deps=
+  ansible
+changedir=ansible
+commands=
+  ansible-playbook -i '127.0.0.1,' playbook.yml --syntax-check -vv
+
+[testenv:flake8]
+install_command = pip install --upgrade {opts} {packages}
+deps=
+  flake8
+commands=flake8 --select=F,E9 {posargs:*.py collectors tests}
+
+# Integration tests must operate against a live deployment. To run, simply:
+#   tox -e integration /path/to/inventory
+# NOTE: A current limitation of these tests is that they assume that defaults
+#       were used for things like ports, usernames, etc. They do, however,
+#       support devel_mode=True/False.
+[testenv:integration]
+install_command = pip install --upgrade {opts} {packages}
+deps=
+  ansible
+  pytest
+  pytest-xdist
+  testinfra
+changedir=ansible
+commands=
+  py.test -v -n auto --connection=ansible --ansible-inventory {posargs} ./roles/
+
+[testenv:dashboards]
+install_command = pip install --upgrade {opts} {packages}
+deps=
+  pytest
+changedir=dashboards
+commands=
+  py.test -v ./