From 21b9cdf1992f4133d2deb577173be18d6aac827b Mon Sep 17 00:00:00 2001 From: Paul Cuzner Date: Wed, 16 May 2018 20:41:16 +1200 Subject: [PATCH] Changes to health history and inclusion of pg information --- dashboards/mgr-prometheus/ceph-health.json | 562 +++++++++++++-------- 1 file changed, 346 insertions(+), 216 deletions(-) diff --git a/dashboards/mgr-prometheus/ceph-health.json b/dashboards/mgr-prometheus/ceph-health.json index 9fb9177..707c92c 100644 --- a/dashboards/mgr-prometheus/ceph-health.json +++ b/dashboards/mgr-prometheus/ceph-health.json @@ -48,7 +48,7 @@ "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1526260871845, + "iteration": 1526456532165, "links": [ { "asDropdown": true, @@ -189,16 +189,14 @@ "panels": [ { "aliasColors": { - "Ceph Health": "#0a50a1", - "Ceph Health (0:OK, 4:Warning,8:Error)": "#DEDAF7", - "collectd.obj-mon-1.storage.lab.cephmetrics.gauge.ceph.mon.health": "#DEDAF7" + "Ceph Health": "#0a50a1" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": null, - "description": "The chart plots the clusters health, over time. Health is depicted as a integer; 0, 1 or 2 where 0 is OK, 1 is WARN and 2 represents an ERROR state. Horizontal thresholds (drawn in yellow and red) indicate when the cluster's health transitions to 'warning' or 'error'.", - "fill": 1, + "description": "The chart plots the cluster's health over time. The colored bands show 3 distinct areas: green (OK), yellow (WARN) and red (ERROR). 
The plot line in blue is this cluster's current health, so you can see over time how long the cluster spends in an OK, WARN or ERROR state.", + "fill": 0, "gridPos": { "h": 9, "w": 24, @@ -233,7 +231,7 @@ "steppedLine": true, "targets": [ { - "expr": "ceph_health_status", + "expr": "ceph_health_status ", "format": "time_series", "instant": false, "intervalFactor": 2, @@ -246,21 +244,27 @@ "thresholds": [ { "colorMode": "custom", - "fill": false, - "fillColor": "rgba(222, 226, 0, 0.47)", - "line": true, - "lineColor": "rgb(247, 172, 0)", - "op": "gt", - "value": 1 + "fill": true, + "fillColor": "#9ac48a", + "line": false, + "op": "lt", + "value": 0.1 + }, + { + "colorMode": "custom", + "fill": true, + "fillColor": "rgba(244, 213, 152, 0.58)", + "line": false, + "op": "lt", + "value": 1.1 }, { "colorMode": "custom", - "fill": false, - "fillColor": "rgba(246, 3, 3, 0.5)", - "line": true, - "lineColor": "rgb(203, 0, 0)", + "fill": true, + "fillColor": "rgba(163, 0, 0, 0.3)", + "line": false, "op": "gt", - "value": 2 + "value": 1.1 } ], "timeFrom": "3d", @@ -285,7 +289,7 @@ "label": "", "logBase": 1, "max": "2", - "min": "0", + "min": "-0.5", "show": false }, { @@ -1309,6 +1313,280 @@ "x": 0, "y": 12 }, + "id": 64, + "panels": [ + { + "columns": [ + { + "text": "Current", + "value": "current" + } + ], + "datasource": null, + "filterNull": false, + "fontSize": "100%", + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 13 + }, + "id": 18, + "links": [], + "minSpan": 6, + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": null, + "desc": false + }, + "styles": [ + { + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "Object State", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "Metric", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + 
"alias": "Count", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "pattern": "Current", + "thresholds": [], + "type": "number", + "unit": "none" + }, + { + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 0, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "none" + } + ], + "targets": [ + { + "alias": "Objects", + "dsType": "influxdb", + "expr": "ceph_cluster_total_objects", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 1, + "legendFormat": "Total Objects", + "measurement": "collectd.obj-mon-1.storage.lab.cephmetrics.gauge.ceph.mon.num_object", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "step": 20, + "tags": [], + "textEditor": true + }, + { + "expr": "", + "format": "table", + "intervalFactor": 1, + "legendFormat": "Objects misplaced", + "refId": "B" + }, + { + "expr": "", + "format": "table", + "intervalFactor": 1, + "legendFormat": "Objects degraded", + "refId": "C" + }, + { + "expr": "", + "format": "table", + "intervalFactor": 1, + "legendFormat": "Objects unfound", + "refId": "D" + } + ], + "title": "Object Summary", + "transform": "timeseries_aggregations", + "type": "table" + }, + { + "columns": [ + { + "text": "Current", + "value": "current" + } + ], + "datasource": null, + "filterNull": false, + "fontSize": "100%", + "gridPos": { + "h": 7, + "w": 5, + "x": 5, + "y": 13 + }, + "id": 20, + "links": [], + "minSpan": 6, + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": null, + "desc": false + }, + 
"styles": [ + { + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "PG State", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "Metric", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + "alias": "Count", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "pattern": "Current", + "thresholds": [], + "type": "number", + "unit": "none" + }, + { + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 0, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "none" + } + ], + "targets": [ + { + "expr": "ceph_pg_total", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "PGs", + "refId": "A", + "step": 20 + }, + { + "expr": "ceph_pg_active", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "PGs Active", + "refId": "B", + "step": 20 + }, + { + "expr": "ceph_pg_clean", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "PGs Active+clean", + "refId": "C", + "step": 20 + }, + { + "expr": "ceph_pg_peering", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "PGs peering", + "refId": "D", + "step": 20 + } + ], + "title": "PG Summary", + "transform": "timeseries_aggregations", + "type": "table" + } + ], + "title": "RADOS Information", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, "id": 24, "panels": [ { @@ -1322,9 +1600,9 @@ "fontSize": "100%", "gridPos": { "h": 7, - "w": 2, + "w": 3, "x": 0, - "y": 22 + "y": 14 }, "id": 14, "links": [], @@ -1415,8 +1693,8 @@ "gridPos": { "h": 7, "w": 6, - "x": 2, - "y": 22 + 
"x": 3, + "y": 14 }, "hideTimeOverride": true, "id": 15, @@ -1522,8 +1800,8 @@ "gridPos": { "h": 7, "w": 6, - "x": 8, - "y": 22 + "x": 9, + "y": 14 }, "hideTimeOverride": true, "id": 16, @@ -1615,39 +1893,28 @@ "type": "table" }, { - "columns": [ - { - "text": "Current", - "value": "current" - } - ], + "columns": [], "datasource": null, - "filterNull": false, + "description": "This table shows all OSDs with > 275 PG's", "fontSize": "100%", "gridPos": { "h": 7, - "w": 5, - "x": 14, - "y": 22 + "w": 9, + "x": 15, + "y": 14 }, - "id": 18, + "id": 68, "links": [], - "minSpan": 6, "pageSize": null, "scroll": true, "showHeader": true, "sort": { - "col": null, - "desc": false + "col": 5, + "desc": true }, "styles": [ { - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "date" - }, - { - "alias": "Object State", + "alias": "OSD", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", @@ -1656,13 +1923,13 @@ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, - "pattern": "Metric", + "pattern": "ceph_daemon", "thresholds": [], - "type": "number", + "type": "string", "unit": "short" }, { - "alias": "Count", + "alias": "Device", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", @@ -1670,130 +1937,14 @@ "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "pattern": "Current", - "thresholds": [], - "type": "number", - "unit": "none" - }, - { - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 0, - "pattern": "/.*/", + "decimals": 2, + "pattern": "device", "thresholds": [], - "type": "number", - "unit": "none" - } - ], - "targets": [ - { - "alias": "Objects", - "dsType": "influxdb", - "expr": "ceph_cluster_total_objects", - "format": "time_series", - "groupBy": [ - { - "params": [ - "$__interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "intervalFactor": 1, - "legendFormat": "Total 
Objects", - "measurement": "collectd.obj-mon-1.storage.lab.cephmetrics.gauge.ceph.mon.num_object", - "policy": "default", - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "step": 20, - "tags": [], - "textEditor": true - }, - { - "expr": "", - "format": "table", - "intervalFactor": 1, - "legendFormat": "Objects misplaced", - "refId": "B" - }, - { - "expr": "", - "format": "table", - "intervalFactor": 1, - "legendFormat": "Objects degraded", - "refId": "C" - }, - { - "expr": "", - "format": "table", - "intervalFactor": 1, - "legendFormat": "Objects unfound", - "refId": "D" - } - ], - "title": "Object Summary", - "transform": "timeseries_aggregations", - "type": "table" - }, - { - "columns": [ - { - "text": "Current", - "value": "current" - } - ], - "datasource": null, - "filterNull": false, - "fontSize": "100%", - "gridPos": { - "h": 7, - "w": 5, - "x": 19, - "y": 22 - }, - "id": 20, - "links": [], - "minSpan": 6, - "pageSize": null, - "scroll": true, - "showHeader": true, - "sort": { - "col": null, - "desc": false - }, - "styles": [ - { - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "date" + "type": "string", + "unit": "short" }, { - "alias": "PG State", + "alias": "Host", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", @@ -1802,76 +1953,55 @@ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, - "pattern": "Metric", + "pattern": "instance", "thresholds": [], "type": "number", "unit": "short" }, { - "alias": "Count", - "colorMode": null, + "alias": "# PGs", + "colorMode": "row", "colors": [ - "rgba(245, 54, 54, 0.9)", + "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" + "rgba(245, 54, 54, 0.9)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, - "pattern": "Current", - "thresholds": [], + "pattern": "Value", + "thresholds": [ + "200", + "250" + ], "type": "number", 
"unit": "none" }, { + "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], - "decimals": 0, + "decimals": 2, "pattern": "/.*/", "thresholds": [], - "type": "number", - "unit": "none" + "type": "hidden", + "unit": "short" } ], "targets": [ { - "expr": "ceph_pg_total", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "PGs", - "refId": "A", - "step": 20 - }, - { - "expr": "ceph_pg_active", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "PGs Active", - "refId": "B", - "step": 20 - }, - { - "expr": "ceph_pg_clean", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "PGs Active+clean", - "refId": "C", - "step": 20 - }, - { - "expr": "ceph_pg_peering", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "PGs peering", - "refId": "D", - "step": 20 + "expr": "(ceph_osd_numpg > 275) * on(ceph_daemon) group_left(instance,device) ceph_disk_occupation", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" } ], - "title": "PG Summary", - "transform": "timeseries_aggregations", + "title": "OSDs with High PG Counts", + "transform": "table", "type": "table" }, { @@ -1880,7 +2010,7 @@ "h": 7, "w": 2, "x": 6, - "y": 29 + "y": 21 }, "id": 17, "links": [], @@ -1896,7 +2026,7 @@ "h": 7, "w": 2, "x": 16, - "y": 29 + "y": 21 }, "id": 19, "links": [], @@ -1923,7 +2053,7 @@ { "allValue": null, "current": {}, - "datasource": null, + "datasource": "${DS_LOCAL}", "hide": 2, "includeAll": false, "label": null, @@ -1965,7 +2095,7 @@ { "allValue": null, "current": {}, - "datasource": null, + "datasource": "${DS_LOCAL}", "hide": 2, "includeAll": true, "label": null, @@ -2017,5 +2147,5 @@ "timezone": "browser", "title": "Ceph Health", "uid": "000000009", - "version": 34 + "version": 37 } \ No newline at end of file -- 2.47.3