From: Zack Cerza Date: Fri, 1 Sep 2017 15:47:43 +0000 (-0600) Subject: Add alert for pool capacity X-Git-Tag: v1.0~26^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1c41fa11d84d79b33a820f0f59920dd072aea759;p=cephmetrics.git Add alert for pool capacity Signed-off-by: Zack Cerza --- diff --git a/dashboards/current/alert-status.json b/dashboards/current/alert-status.json index 9fbca76..4b951d5 100644 --- a/dashboards/current/alert-status.json +++ b/dashboards/current/alert-status.json @@ -1,1139 +1,1255 @@ { - "meta" : { - "expires" : "0001-01-01T00:00:00Z", - "canSave" : true, - "created" : "2017-08-03T21:42:28Z", - "canStar" : true, - "slug" : "alert-status", - "createdBy" : "admin", - "canEdit" : true, - "updated" : "2017-08-18T05:26:10Z", - "version" : 15, - "updatedBy" : "admin", - "type" : "db" - }, - "dashboard" : { - "version" : 15, - "timepicker" : { - "time_options" : [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ], - "refresh_intervals" : [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "refresh" : "10s", - "hideControls" : true, - "id" : 24, - "annotations" : { - "list" : [] - }, - "style" : "dark", - "gnetId" : null, - "timezone" : "browser", - "schemaVersion" : 14, - "time" : { - "from" : "now-1h", - "to" : "now" - }, - "rows" : [ + "meta": { + "canSave": true, + "created": "2017-08-03T21:42:28Z", + "canStar": true, + "expires": "0001-01-01T00:00:00Z", + "updated": "2017-08-18T05:26:10Z", + "slug": "alert-status", + "version": 15, + "createdBy": "admin", + "updatedBy": "admin", + "type": "db", + "canEdit": true + }, + "dashboard": { + "style": "dark", + "rows": [ { - "panels" : [ + "repeat": null, + "titleSize": "h6", + "collapse": false, + "title": "Dashboard Row", + "height": "250px", + "repeatRowId": null, + "panels": [ { - "limit" : "20", - "title" : "Active Ceph Alert List", - "span" : 12, - "id" : 1, - "sortOrder" : 3, - "onlyAlertsOnDashboard" : true, - "links" : [], - "show" : "current", - "type" : "alertlist", - "stateFilter" : [ + "span": 12, + "stateFilter": [ "alerting" - ] + ], + "links": [], + "show": "current", + "title": "Active Ceph Alert List", + "onlyAlertsOnDashboard": true, + "limit": "20", + "sortOrder": 3, + "type": "alertlist", + "id": 1 } - ], - "repeatIteration" : null, - "repeat" : null, - "showTitle" : false, - "collapse" : false, - "title" : "Dashboard Row", - "repeatRowId" : null, - "height" : "250px", - "titleSize" : "h6" - }, + ], + "showTitle": false, + "repeatIteration": null + }, { - "height" : 250, - "titleSize" : "h5", - "title" : "Health Checks", - "collapse" : false, - "repeatRowId" : null, - "showTitle" : true, - "panels" : [ + "repeat": null, + "titleSize": "h5", + "collapse": false, + "title": "Health Checks", + "height": 250, + "repeatRowId": null, + "panels": [ { - "legend" : { - "min" : false, - "values" : false, - "current" : false, - "show" : true, - "total" : false, - "avg" : false, - "max" : false - }, - "dashes" : false, - "hideTimeOverride" : false, - "percentage" : false, - "maxDataPoints" : "360", - "alert" : { - "notifications" : [ - { - "id" : 1 - } - ], - "name" : "Overall Ceph Health", - "conditions" : [ - { - "query" : { - "params" : [ - "A", - "20s", - "now" - ] - }, - "reducer" : { - "params" : [], - "type" : "last" - }, - "operator" : { - "type" : "and" - }, - "evaluator" : { - "type" : "gt", - "params" : [ - 0 - ] - }, - "type" : "query" - } - ], - "message" : "Cluster Health is not OK", - "handler" : 1, - "frequency" : "10s", - "executionErrorState" : "keep_state", - "noDataState" : "no_data" - }, - "id" : 2, - "points" : false, - "spaceLength" : 10, - "renderer" : "flot", - "minSpan" : 2, - "dashLength" : 10, - "pointradius" : 5, - "xaxis" : { - "values" : [], - "name" : null, - "show" : true, - "buckets" : null, - "mode" : "time" - }, - "yaxes" : [ + "bars": false, + "timeFrom": null, + "links": [], + "thresholds": [ { - "format" : "short", - "max" : "10", - "label" : "", - "show" : true, - "min" : "0", - "logBase" : 1 - }, - { - "max" : null, - "format" : "short", - "show" : false, - "label" : null, - "logBase" : 1, - "min" : null + "colorMode": "critical", + "line": true, + "fill": true, + "value": 0, + "op": "gt" } - ], - "bars" : false, - "description" : "The chart plots the clusters health, over time. Health is depicted as a integer; 0, 4 or 8 where 0 is OK, 4 is WARN and 8 represents an ERROR state.", - "thresholds" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 2, + "steppedLine": true, + "id": 2, + "maxDataPoints": "360", + "fill": 1, + "span": 2, + "title": "Overall Ceph Health", + "tooltip": { + "sort": 1, + "shared": false, + "value_type": "individual" + }, + "targets": [ { - "fill" : true, - "colorMode" : "critical", - "op" : "gt", - "line" : true, - "value" : 0 + "textEditor": true, + "target": "alias(maxSeries(consolidateBy(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.mon.health,0)),\"max\")),\"Ceph Health\")", + "refId": "A" } - ], - "tooltip" : { - "shared" : false, - "value_type" : "individual", - "sort" : 1 - }, - "targets" : [ + ], + "yaxes": [ + { + "logBase": 1, + "format": "short", + "max": "10", + "min": "0", + "label": "", + "show": true + }, { - "target" : "alias(maxSeries(consolidateBy(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.mon.health,0)),\"max\")),\"Ceph Health\")", - "textEditor" : true, - "refId" : "A" + "logBase": 1, + "show": false, + "max": null, + "format": "short", + "label": null, + "min": null } - ], - "linewidth" : 2, - "stack" : false, - "title" : "Overall Ceph Health", - "nullPointMode" : "null", - "span" : 2, - "seriesOverrides" : [], - "lines" : true, - "datasource" : "Local", - "fill" : 1, - "timeShift" : null, - "links" : [], - "type" : "graph", - "timeFrom" : null, - "steppedLine" : true, - "aliasColors" : { - "Ceph Health" : "#890F02", - "Ceph Health (0:OK, 4:Warning,8:Error)" : "#DEDAF7", - "ceph health" : "#890F02" - } - }, - { - "percentage" : false, - "dashes" : false, - "legend" : { - "current" : false, - "show" : false, - "min" : false, - "values" : false, - "max" : false, - "total" : false, - "avg" : false - }, - "spaceLength" : 10, - "id" : 3, - "points" : false, - "alert" : { - "name" : "Disks Near Full", - "notifications" : [ + ], + "xaxis": { + "buckets": null, + "show": true, + "values": [], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "The chart plots the clusters health, over time. Health is depicted as a integer; 0, 4 or 8 where 0 is OK, 4 is WARN and 8 represents an ERROR state.", + "alert": { + "noDataState": "no_data", + "name": "Overall Ceph Health", + "frequency": "10s", + "notifications": [ { - "id" : 1 + "id": 1 } - ], - "message" : "DIsks Near full detected within the cluster. Warning threshold is 80% full.", - "conditions" : [ + ], + "handler": 1, + "executionErrorState": "keep_state", + "message": "Cluster Health is not OK", + "conditions": [ { - "operator" : { - "type" : "and" - }, - "reducer" : { - "type" : "max", - "params" : [] - }, - "query" : { - "params" : [ - "A", - "1m", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "20s", "now" ] - }, - "type" : "query", - "evaluator" : { - "params" : [ + }, + "evaluator": { + "type": "gt", + "params": [ 0 - ], - "type" : "gt" - } + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" } - ], - "frequency" : "60s", - "executionErrorState" : "keep_state", - "handler" : 1, - "noDataState" : "ok" - }, - "pointradius" : 5, - "renderer" : "flot", - "minSpan" : 2, - "dashLength" : 10, - "yaxes" : [ - { - "min" : "0", - "logBase" : 1, - "label" : null, - "show" : true, - "format" : "short", - "max" : null - }, + ] + }, + "hideTimeOverride": false, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": { + "Ceph Health (0:OK, 4:Warning,8:Error)": "#DEDAF7", + "Ceph Health": "#890F02", + "ceph health": "#890F02" + }, + "lines": true, + "legend": { + "total": false, + "min": false, + "max": false, + "show": true, + "current": false, + "values": false, + "avg": false + }, + "points": false, + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, + { + "bars": false, + "timeFrom": null, + "links": [], + "thresholds": [ { - "label" : null, - "show" : false, - "min" : null, - "logBase" : 1, - "format" : "short", - "max" : null + "colorMode": "critical", + "line": true, + "fill": true, + "value": 0, + "op": "gt" } - ], - "xaxis" : { - "mode" : "time", - "values" : [], - "name" : null, - "show" : true, - "buckets" : null - }, - "description" : "This shows how many disks are at or above 80% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSD's.", - "bars" : false, - "tooltip" : { - "value_type" : "individual", - "sort" : 0, - "shared" : true - }, - "targets" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 1, + "steppedLine": false, + "targets": [ { - "textEditor" : true, - "target" : "currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)", - "refId" : "A", - "hide" : true - }, + "textEditor": true, + "hide": true, + "target": "currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)", + "refId": "A" + }, { - "target" : "alias(countSeries(#A),\"OSDs Near Full\")", - "textEditor" : true, - "refId" : "B", - "targetFull" : "alias(countSeries(currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)),\"OSDs Near Full\")" + "targetFull": "alias(countSeries(currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)),\"OSDs Near Full\")", + "textEditor": true, + "target": "alias(countSeries(#A),\"OSDs Near Full\")", + "refId": "B" } - ], - "linewidth" : 1, - "thresholds" : [ + ], + "fill": 1, + "span": 2, + "title": "Disks Near Full", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "id": 3, + "yaxes": [ + { + "logBase": 1, + "min": "0", + "max": null, + "format": "short", + "label": null, + "show": true + }, { - "colorMode" : "critical", - "fill" : true, - "value" : 0, - "op" : "gt", - "line" : true + "logBase": 1, + "min": null, + "max": null, + "format": "short", + "show": false, + "label": null } - ], - "datasource" : "Local", - "lines" : true, - "title" : "Disks Near Full", - "stack" : false, - "nullPointMode" : "null", - "span" : 2, - "seriesOverrides" : [], - "aliasColors" : {}, - "steppedLine" : false, - "timeShift" : null, - "links" : [], - "fill" : 1, - "type" : "graph", - "timeFrom" : null - }, - { - "hideTimeOverride" : true, - "percentage" : false, - "dashes" : false, - "legend" : { - "current" : false, - "min" : false, - "alignAsTable" : false, - "max" : false, - "total" : false, - "show" : false, - "rightSide" : false, - "values" : false, - "avg" : false, - "hideZero" : false - }, - "spaceLength" : 10, - "points" : false, - "id" : 4, - "alert" : { - "name" : "OSDs Down", - "notifications" : [ + ], + "xaxis": { + "buckets": null, + "values": [], + "mode": "time", + "name": null, + "show": true + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "This shows how many disks are at or above 80% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSD's.", + "alert": { + "noDataState": "ok", + "name": "Disks Near Full", + "frequency": "60s", + "notifications": [ { - "id" : 1 + "id": 1 } - ], - "conditions" : [ + ], + "handler": 1, + "executionErrorState": "keep_state", + "message": "DIsks Near full detected within the cluster. Warning threshold is 80% full.", + "conditions": [ { - "evaluator" : { - "type" : "gt", - "params" : [ - 0 - ] - }, - "type" : "query", - "query" : { - "params" : [ - "C", - "30s", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1m", "now" ] - }, - "reducer" : { - "params" : [], - "type" : "max" - }, - "operator" : { - "type" : "and" - } + }, + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "reducer": { + "type": "max", + "params": [] + }, + "type": "query" } - ], - "message" : "OSD Down event", - "handler" : 1, - "executionErrorState" : "alerting", - "frequency" : "10s", - "noDataState" : "ok" - }, - "pointradius" : 5, - "dashLength" : 10, - "minSpan" : 2, - "renderer" : "flot", - "yaxes" : [ - { - "format" : "short", - "max" : null, - "label" : null, - "show" : true, - "min" : "0", - "logBase" : 1 - }, + ] + }, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": {}, + "lines": true, + "legend": { + "total": false, + "show": false, + "max": false, + "min": false, + "current": false, + "values": false, + "avg": false + }, + "points": false, + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, + { + "bars": true, + "timeFrom": "5m", + "links": [], + "thresholds": [ { - "logBase" : 1, - "min" : null, - "show" : false, - "label" : null, - "max" : null, - "format" : "short" + "colorMode": "critical", + "line": true, + "op": "gt", + "value": 0, + "fill": true } - ], - "xaxis" : { - "mode" : "time", - "show" : true, - "buckets" : null, - "name" : null, - "values" : [] - }, - "description" : "Count of OSDs currently in a DOWN state", - "bars" : true, - "linewidth" : 2, - "tooltip" : { - "shared" : true, - "value_type" : "individual", - "sort" : 0 - }, - "targets" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 2, + "steppedLine": true, + "targets": [ { - "target" : "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\")", - "textEditor" : true, - "hide" : true, - "refId" : "A" - }, + "textEditor": true, + "hide": true, + "target": "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\")", + "refId": "A" + }, { - "hide" : true, - "refId" : "B", - "target" : "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")", - "textEditor" : true - }, + "hide": true, + "textEditor": true, + "refId": "B", + "target": "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")" + }, { - "targetFull" : "alias(diffSeries(alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\"),alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")), \"down\")", - "refId" : "C", - "hide" : false, - "textEditor" : true, - "target" : "alias(diffSeries(#A,#B), \"down\")" + "hide": false, + "targetFull": "alias(diffSeries(alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\"),alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")), \"down\")", + "textEditor": true, + "refId": "C", + "target": "alias(diffSeries(#A,#B), \"down\")" } - ], - "thresholds" : [ + ], + "fill": 2, + "span": 2, + "title": "OSDs Down", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "id": 4, + "yaxes": [ + { + "logBase": 1, + "format": "short", + "max": null, + "min": "0", + "label": null, + "show": true + }, { - "op" : "gt", - "line" : true, - "value" : 0, - "fill" : true, - "colorMode" : "critical" + "logBase": 1, + "min": null, + "max": null, + "format": "short", + "show": false, + "label": null } - ], - "datasource" : "Local", - "lines" : false, - "nullPointMode" : "null", - "title" : "OSDs Down", - "stack" : false, - "span" : 2, - "seriesOverrides" : [], - "steppedLine" : true, - "aliasColors" : {}, - "timeFrom" : "5m", - "type" : "graph", - "fill" : 2, - "timeShift" : null, - "links" : [] - }, + ], + "xaxis": { + "buckets": null, + "show": true, + "values": [], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "Count of OSDs currently in a DOWN state", + "alert": { + "noDataState": "ok", + "name": "OSDs Down", + "frequency": "10s", + "notifications": [ + { + "id": 1 + } + ], + "handler": 1, + "executionErrorState": "alerting", + "message": "OSD Down event", + "conditions": [ + { + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C", + "30s", + "now" + ] + }, + "evaluator": { + "type": "gt", + "params": [ + 0 + ] + }, + "reducer": { + "params": [], + "type": "max" + }, + "type": "query" + } + ] + }, + "hideTimeOverride": true, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": {}, + "lines": false, + "legend": { + "rightSide": false, + "total": false, + "min": false, + "max": false, + "show": false, + "current": false, + "values": false, + "alignAsTable": false, + "avg": false, + "hideZero": false + }, + "points": false, + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, { - "bars" : false, - "description" : "This trigger raises a notification if the raw used crosses the 85% capacity threshold of the ceph cluster", - "thresholds" : [ + "bars": false, + "timeFrom": null, + "links": [], + "thresholds": [ { - "op" : "gt", - "line" : true, - "value" : 85, - "fill" : true, - "colorMode" : "critical" + "colorMode": "critical", + "line": true, + "op": "gt", + "value": 85, + "fill": true } - ], - "linewidth" : 1, - "targets" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "stack": false, + "linewidth": 1, + "steppedLine": false, + "targets": [ { - "hide" : true, - "refId" : "A", - "textEditor" : true, - "target" : "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")" - }, + "hide": true, + "textEditor": true, + "refId": "A", + "target": "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")" + }, { - "refId" : "B", - "hide" : true, - "textEditor" : true, - "target" : "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\")" - }, + "hide": true, + "textEditor": true, + "refId": "B", + "target": "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\")" + }, { - "target" : "alias(asPercent(#B, #A), \"Raw Capacity Used %\")", - "textEditor" : true, - "targetFull" : "alias(asPercent(alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\"), alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")), \"Raw Capacity Used %\")", - "refId" : "C" + "targetFull": "alias(asPercent(alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\"), alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")), \"Raw Capacity Used %\")", + "textEditor": true, + "target": "alias(asPercent(#B, #A), \"Raw Capacity Used %\")", + "refId": "C" } - ], - "tooltip" : { - "shared" : true, - "value_type" : "individual", - "sort" : 0 - }, - "title" : "Cluster Capacity", - "stack" : false, - "seriesOverrides" : [], - "span" : 2, - "nullPointMode" : "null", - "lines" : true, - "datasource" : "Local", - "timeFrom" : null, - "type" : "graph", - "timeShift" : null, - "links" : [], - "fill" : 1, - "aliasColors" : {}, - "steppedLine" : false, - "legend" : { - "max" : false, - "avg" : false, - "total" : false, - "show" : true, - "current" : false, - "values" : false, - "min" : false - }, - "dashes" : false, - "percentage" : false, - "alert" : { - "name" : "Cluster Capacity", - "notifications" : [ + ], + "fill": 1, + "span": 2, + "title": "Cluster Capacity", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "id": 5, + "points": false, + "xaxis": { + "buckets": null, + "show": true, + "values": [], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "This trigger raises a notification if the raw used crosses the 85% capacity threshold of the ceph cluster", + "alert": { + "noDataState": "keep_state", + "name": "Cluster Capacity", + "frequency": "60s", + "notifications": [ { - "id" : 1 + "id": 1 } - ], - "message" : "Cluster Capacity Limit Warning", - "conditions" : [ + ], + "handler": 1, + "executionErrorState": "alerting", + "message": "Cluster Capacity Limit Warning", + "conditions": [ { - "evaluator" : { - "params" : [ - 85 - ], - "type" : "gt" - }, - "type" : "query", - "query" : { - "params" : [ - "C", - "1h", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C", + "1h", "now" ] - }, - "reducer" : { - "params" : [], - "type" : "avg" - }, - "operator" : { - "type" : "and" - } + }, + "evaluator": { + "params": [ + 85 + ], + "type": "gt" + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" } - ], - "executionErrorState" : "alerting", - "frequency" : "60s", - "handler" : 1, - "noDataState" : "keep_state" - }, - "points" : false, - "id" : 5, - "spaceLength" : 10, - "minSpan" : 2, - "dashLength" : 10, - "renderer" : "flot", - "pointradius" : 5, - "xaxis" : { - "values" : [], - "name" : null, - "show" : true, - "buckets" : null, - "mode" : "time" - }, - "yaxes" : [ + ] + }, + "dashLength": 10, + "legend": { + "total": false, + "min": false, + "max": false, + "show": true, + "current": false, + "values": false, + "avg": false + }, + "timeShift": null, + "aliasColors": {}, + "lines": true, + "yaxes": [ { - "label" : "", - "show" : true, - "min" : "0", - "logBase" : 1, - "format" : "percent", - "max" : "100" - }, + "logBase": 1, + "min": "0", + "max": "100", + "format": "percent", + "show": true, + "label": "" + }, { - "logBase" : 1, - "min" : null, - "show" : false, - "label" : null, - "max" : null, - "format" : "short" + "logBase": 1, + "min": null, + "max": null, + "format": "short", + "show": false, + "label": null } - ] - }, + ], + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, { - "alert" : { - "executionErrorState" : "alerting", - "frequency" : "60s", - "handler" : 1, - "noDataState" : "no_data", - "notifications" : [ + "bars": false, + "timeFrom": "6h", + "links": [], + "thresholds": [ + { + "colorMode": "critical", + "line": true, + "fill": true, + "value": 0, + "op": "gt" + } + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 2, + "steppedLine": false, + "targets": [ + { + "textEditor": true, + "refId": "A", + "target": "alias(maxSeries(consolidateBy(collectd.*.$domain.cephmetrics.gauge.*.mon.num_pgs_stuck, \"maxSeries\")), \"# pg's stuck inactive\")" + } + ], + "fill": 2, + "span": 2, + "title": "PG's Stuck", + "tooltip": { + "sort": 0, + "shared": false, + "value_type": "individual" + }, + "id": 8, + "yaxes": [ + { + "logBase": 1, + "min": "0", + "max": null, + "format": "short", + "show": true, + "label": null + }, + { + "logBase": 1, + "show": false, + "max": null, + "format": "short", + "min": null, + "label": null + } + ], + "xaxis": { + "buckets": null, + "show": true, + "values": [ + "total" + ], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "This chart shows whether there are pg's in a stuck state, that need manual intervention to resolve.", + "alert": { + "noDataState": "no_data", + "name": "PG's Stuck", + "frequency": "60s", + "notifications": [ { - "id" : 1 + "id": 1 } - ], - "name" : "PG's Stuck", - "message" : "PG's stuck inactive", - "conditions" : [ + ], + "handler": 1, + "executionErrorState": "alerting", + "message": "PG's stuck inactive", + "conditions": [ { - "evaluator" : { - "params" : [ - 0 - ], - "type" : "gt" - }, - "type" : "query", - "query" : { - "params" : [ - "A", - "1m", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1m", "now" ] - }, - "reducer" : { - "type" : "last", - "params" : [] - }, - "operator" : { - "type" : "and" - } + }, + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "reducer": { + "type": "last", + "params": [] + }, + "type": "query" } ] - }, - "points" : false, - "id" : 8, - "spaceLength" : 10, - "legend" : { - "max" : false, - "avg" : false, - "total" : false, - "show" : true, - "current" : false, - "values" : false, - "min" : false - }, - "dashes" : false, - "percentage" : false, - "hideTimeOverride" : true, - "xaxis" : { - "mode" : "time", - "show" : true, - "buckets" : null, - "name" : null, - "values" : [ - "total" - ] - }, - "yaxes" : [ - { - "logBase" : 1, - "min" : "0", - "show" : true, - "label" : null, - "max" : null, - "format" : "short" - }, + }, + "hideTimeOverride": true, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": {}, + "lines": true, + "legend": { + "total": false, + "min": false, + "max": false, + "show": true, + "current": false, + "values": false, + "avg": false + }, + "points": false, + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, + { + "bars": false, + "timeFrom": null, + "links": [], + "thresholds": [ { - "show" : false, - "label" : null, - "logBase" : 1, - "min" : null, - "max" : null, - "format" : "short" + "colorMode": "critical", + "line": true, + "op": "lt", + "value": 0, + "fill": true } - ], - "dashLength" : 10, - "minSpan" : 2, - "renderer" : "flot", - "pointradius" : 5, - "thresholds" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 1, + "steppedLine": false, + "targets": [ { - "fill" : true, - "colorMode" : "critical", - "line" : true, - "op" : "gt", - "value" : 0 - } - ], - "linewidth" : 2, - "tooltip" : { - "shared" : false, - "sort" : 0, - "value_type" : "individual" - }, - "targets" : [ + "hide": true, + "textEditor": true, + "refId": "A", + "target": "alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\")" + }, { - "refId" : "A", - "target" : "alias(maxSeries(consolidateBy(collectd.*.$domain.cephmetrics.gauge.*.mon.num_pgs_stuck, \"maxSeries\")), \"# pg's stuck inactive\")", - "textEditor" : true + "textEditor": true, + "hide": true, + "target": "alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")", + "refId": "B" + }, + { + "targetFull": "alias(diffSeries(alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\"),alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")),\"freespace after Node loss\")", + "textEditor": true, + "refId": "C", + "target": "alias(diffSeries(#A,#B),\"freespace after Node loss\")" } - ], - "bars" : false, - "description" : "This chart shows whether there are pg's in a stuck state, that need manual intervention to resolve.", - "timeFrom" : "6h", - "type" : "graph", - "links" : [], - "timeShift" : null, - "fill" : 2, - "aliasColors" : {}, - "steppedLine" : false, - "span" : 2, - "title" : "PG's Stuck", - "stack" : false, - "seriesOverrides" : [], - "nullPointMode" : "null", - "lines" : true, - "datasource" : "Local" - }, - { - "xaxis" : { - "values" : [], - "name" : null, - "show" : true, - "buckets" : null, - "mode" : "time" - }, - "yaxes" : [ + ], + "fill": 1, + "span": 2, + "title": "OSD Host Loss Check", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "id": 9, + "yaxes": [ { - "show" : true, - "label" : null, - "logBase" : 1, - "min" : "0", - "max" : null, - "format" : "decbytes" - }, + "logBase": 1, + "show": true, + "max": null, + "format": "decbytes", + "min": "0", + "label": null + }, { - "show" : true, - "label" : null, - "logBase" : 1, - "min" : null, - "max" : null, - "format" : "short" + "logBase": 1, + "show": true, + "max": null, + "format": "short", + "min": null, + "label": null } - ], - "minSpan" : 2, - "dashLength" : 10, - "renderer" : "flot", - "pointradius" : 5, - "alert" : { - "noDataState" : "ok", - "handler" : 1, - "executionErrorState" : "alerting", - "frequency" : "60s", - "conditions" : [ + ], + "xaxis": { + "buckets": null, + "show": true, + "values": [], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "This graph checks the cluster @ 90% full is enough to support the loss of the largest OSD host", + "alert": { + "noDataState": "ok", + "name": "OSD Host Loss Check", + "frequency": "60s", + "notifications": [ + { + "id": 1 + } + ], + "handler": 1, + "executionErrorState": "alerting", + "message": "OSD Host Loss Free Space Check Failed", + "conditions": [ { - "reducer" : { - "type" : "min", - "params" : [] - }, - "operator" : { - "type" : "and" - }, - "query" : { - "params" : [ - "A", - "5m", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", "now" ] - }, - "type" : "query", - "evaluator" : { - "type" : "lt", - "params" : [ + }, + "evaluator": { + "type": "lt", + "params": [ 0 ] - } - } - ], - "message" : "OSD Host Loss Free Space Check Failed", - "name" : "OSD Host Loss Check", - "notifications" : [ - { - "id" : 1 + }, + "reducer": { + "type": "min", + "params": [] + }, + "type": "query" } ] - }, - "spaceLength" : 10, - "points" : false, - "id" : 9, - "dashes" : false, - "legend" : { - "current" : false, - "show" : false, - "min" : false, - "values" : false, - "max" : false, - "total" : false, - "avg" : false - }, - "percentage" : false, - "aliasColors" : { - "Largest OSD Host" : "#890F02" - }, - "steppedLine" : false, - "timeFrom" : null, - "type" : "graph", - "timeShift" : null, - "links" : [], - "fill" : 1, - "lines" : true, - "seriesOverrides" : [], - "title" : "OSD Host Loss Check", - "stack" : false, - "span" : 2, - "nullPointMode" : "null", - "datasource" : "Local", - "thresholds" : [ + }, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": { + "Largest OSD Host": "#890F02" + }, + "lines": true, + "legend": { + "total": false, + "show": false, + "max": false, + "min": false, + "current": false, + "values": false, + "avg": false + }, + "points": false, + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, + { + "bars": false, + "timeFrom": "1h", + "links": [], + "thresholds": [ { - "line" : true, - "op" : "lt", - "value" : 0, - "fill" : true, - "colorMode" : "critical" + "colorMode": "critical", + "line": true, + "fill": true, + "value": 1000, + "op": "gt" } - ], - "linewidth" : 1, - "tooltip" : { - "shared" : true, - "sort" : 0, - "value_type" : "individual" - }, - "targets" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "stack": false, + "linewidth": 1, + "steppedLine": false, + "targets": [ { - "refId" : "A", - "hide" : true, - "textEditor" : true, - "target" : "alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\")" - }, + "textEditor": true, + "refId": "A", + "target": "aliasByNode(currentAbove(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.osd.*.perf.await,-1)),1000),1,-3)" + } + ], + "maxDataPoints": "", + "fill": 1, + "span": 2, + "title": "Slow OSD responses", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "id": 10, + "yaxes": [ { - "target" : "alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")", - "textEditor" : true, - "refId" : "B", - "hide" : true - }, + "logBase": 1, + "min": "0", + "max": null, + "format": "none", + "label": "ms", + "show": true + }, { - "targetFull" : "alias(diffSeries(alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\"),alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")),\"freespace after Node loss\")", - "refId" : "C", - "target" : "alias(diffSeries(#A,#B),\"freespace after Node loss\")", - "textEditor" : true + "logBase": 1, + "format": "short", + "max": null, + "min": null, + "label": null, + "show": false } - ], - "bars" : false, - "description" : "This graph checks the cluster @ 90% full is enough to support the loss of the largest OSD host" - }, + ], + "xaxis": { + "buckets": null, + "show": true, + "values": [], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "Graph checking for OSD Latencies that are above 1s.", + "alert": { + "noDataState": "ok", + "name": "Slow OSD responses alert", + "frequency": "30s", + "notifications": [ + { + "id": 1 + } + ], + "handler": 1, + "executionErrorState": "alerting", + "message": "OSD Response time is > 1s", + "conditions": [ + { + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1m", + "now" + ] + }, + "evaluator": { + "type": "gt", + "params": [ + 1000 + ] + }, + "reducer": { + "type": "max", + "params": [] + }, + "type": "query" + } + ] + }, + "hideTimeOverride": true, + "dashLength": 10, + "legend": { + "total": false, + "show": true, + "max": false, + "min": false, + "current": false, + "values": false, + "avg": false + }, + "timeShift": null, + "aliasColors": { + "Largest OSD Host": "#890F02" + }, + "lines": true, + "points": false, + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 + }, { - "renderer" : "flot", - "minSpan" : 2, - "dashLength" : 10, - "pointradius" : 5, - "xaxis" : { - "mode" : "time", - "show" : true, - "buckets" : null, - "values" : [], - "name" : null - }, - "yaxes" : [ + "bars": false, + "timeFrom": null, + "links": [], + "thresholds": [ { - "min" : "0", - "logBase" : 1, - "label" : "ms", - "show" : true, - "format" : "none", - "max" : null - }, + "colorMode": "critical", + "line": true, + "op": "gt", + "value": 10, + "fill": true + } + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 1, + "steppedLine": false, + "id": 11, + "fill": 1, + "span": 2, + "title": "Network Errors", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "targets": [ { - "format" : "short", - "max" : null, - "min" : null, - "logBase" : 1, - "label" : null, - "show" : false + "textEditor": true, + "target": "groupByNode(collectd.*.$domain.interface.*.if_{dropped,errors}.*,1,\"sumSeries\")", + "refId": "A" } - ], - "legend" : { - "max" : false, - "total" : false, - "avg" : false, - "current" : false, - "show" : true, - "min" : false, - "values" : false - }, - "dashes" : false, - "hideTimeOverride" : true, - "percentage" : false, - "maxDataPoints" : "", - "alert" : { - "executionErrorState" : "alerting", - "frequency" : "30s", - "handler" : 1, - "noDataState" : "ok", - "name" : "Slow OSD responses alert", - "notifications" : [ + ], + "points": false, + "xaxis": { + "buckets": null, + "show": true, + "values": [], + "mode": "time", + "name": null + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "description": "Checks all interfaces for dropped/error packets, and alerts if more than 10 are seen in a 5m interval", + "alert": { + "noDataState": "no_data", + "name": "Network Errors alert", + "frequency": "30s", + "notifications": [ { - "id" : 1 + "id": 1 } - ], - "message" : "OSD Response time is > 1s", - "conditions" : [ + ], + "handler": 1, + "executionErrorState": "keep_state", + "message": "Network rx/tx issues detected", + "conditions": [ { - "evaluator" : { - "type" : "gt", - "params" : [ - 1000 - ] - }, - "type" : "query", - "query" : { - "params" : [ - "A", - "1m", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", "now" ] - }, - "operator" : { - "type" : "and" - }, - "reducer" : { - "type" : "max", - "params" : [] - } + }, + "evaluator": { + "type": "gt", + "params": [ + 10 + ] + }, + "reducer": { + "type": "max", + "params": [] + }, + "type": "query" } ] - }, - "id" : 10, - "points" : false, - "spaceLength" : 10, - "title" : "Slow OSD responses", - "stack" : false, - "nullPointMode" : "null", - "span" : 2, - "seriesOverrides" : [], - "lines" : true, - "datasource" : "Local", - "timeShift" : null, - "fill" : 1, - "links" : [], - "timeFrom" : "1h", - "type" : "graph", - "aliasColors" : { - "Largest OSD Host" : "#890F02" - }, - "steppedLine" : false, - "bars" : false, - "description" : "Graph checking for OSD Latencies that are above 1s.", - "thresholds" : [ + }, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": {}, + "lines": true, + "legend": { + "total": false, + "min": false, + "max": false, + "show": false, + "current": false, + "values": false, + "avg": false + }, + "yaxes": [ { - "fill" : true, - "colorMode" : "critical", - "line" : true, - "op" : "gt", - "value" : 1000 - } - ], - "targets" : [ + "logBase": 1, + "format": "none", + "max": null, + "min": "0", + "label": null, + "show": true + }, { - "refId" : "A", - "textEditor" : true, - "target" : "aliasByNode(currentAbove(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.osd.*.perf.await,-1)),1000),1,-3)" + "logBase": 1, + "min": null, + "max": null, + "format": "short", + "show": false, + "label": null } - ], - "tooltip" : { - "sort" : 0, - "value_type" : "individual", - "shared" : true - }, - "linewidth" : 1 - }, + ], + "datasource": null, + "pointradius": 5, + "minSpan": 2 + }, { - "thresholds" : [ + "bars": false, + "timeFrom": null, + "links": [], + "thresholds": [ { - "value" : 10, - "op" : "gt", - "line" : true, - "colorMode" : "critical", - "fill" : true + "colorMode": "critical", + "line": true, + "fill": true, + "value": 85, + "op": "gt" } - ], - "linewidth" : 1, - "tooltip" : { - "shared" : true, - "sort" : 0, - "value_type" : "individual" - }, - "targets" : [ + ], + "spaceLength": 10, + "nullPointMode": "null", + "renderer": "flot", + "linewidth": 2, + "steppedLine": false, + "targets": [ { - "target" : "groupByNode(collectd.*.$domain.interface.*.if_{dropped,errors}.*,1,\"sumSeries\")", - "textEditor" : true, - "refId" : "A" + "textEditor": true, + "refId": "A", + "target": "groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.pools.*.percent_used,-2,'maxSeries')" } - ], - "bars" : false, - "description" : "Checks all interfaces for dropped/error packets, and alerts if more than 10 are seen in a 5m interval", - "aliasColors" : {}, - "steppedLine" : false, - "type" : "graph", - "timeFrom" : null, - "fill" : 1, - "timeShift" : null, - "links" : [], - "lines" : true, - "span" : 2, - "title" : "Network Errors", - "stack" : false, - "seriesOverrides" : [], - "nullPointMode" : "null", - "datasource" : null, - "alert" : { - "handler" : 1, - "frequency" : "30s", - "executionErrorState" : "keep_state", - "noDataState" : "no_data", - "notifications" : [ - { - "id" : 1 - } - ], - "name" : "Network Errors alert", - "conditions" : [ + ], + "fill": 5, + "span": 2, + "title": "Pool Capacity", + "tooltip": { + "sort": 0, + "shared": true, + "value_type": "individual" + }, + "id": 12, + "points": false, + "xaxis": { + "buckets": null, + "values": [], + "mode": "time", + "name": null, + "show": true + }, + "seriesOverrides": [], + "percentage": false, + "type": "graph", + "dashes": false, + "repeat": null, + "alert": { + "noDataState": "keep_state", + "name": "Pool Capacity", + "frequency": "60s", + "notifications": [], + "handler": 1, + "executionErrorState": "alerting", + "conditions": [ { - "query" : { - "params" : [ - "A", - "5m", + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", "now" ] - }, - "reducer" : { - "type" : "max", - "params" : [] - }, - "operator" : { - "type" : "and" - }, - "evaluator" : { - "type" : "gt", - "params" : [ - 10 - ] - }, - "type" : "query" + }, + "evaluator": { + "params": [ + 85 + ], + "type": "gt" + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" } - ], - "message" : "Network rx/tx issues detected" - }, - "spaceLength" : 10, - "points" : false, - "id" : 11, - "dashes" : false, - "legend" : { - "min" : false, - "values" : false, - "current" : false, - "show" : false, - "total" : false, - "avg" : false, - "max" : false - }, - "percentage" : false, - "xaxis" : { - "show" : true, - "buckets" : null, - "name" : null, - "values" : [], - "mode" : "time" - }, - "yaxes" : [ + ] + }, + "dashLength": 10, + "stack": false, + "timeShift": null, + "aliasColors": {}, + "lines": true, + "legend": { + "avg": false, + "min": false, + "max": false, + "show": true, + "current": false, + "values": false, + "total": false + }, + "yaxes": [ { - "format" : "none", - "max" : null, - "label" : null, - "show" : true, - "min" : "0", - "logBase" : 1 - }, + "logBase": 1, + "format": "percent", + "max": null, + "min": null, + "label": null, + "show": true + }, { - "logBase" : 1, - "min" : null, - "show" : false, - "label" : null, - "max" : null, - "format" : "short" + "logBase": 1, + "show": true, + "max": null, + "format": "short", + "label": null, + "min": null } - ], - "minSpan" : 2, - "dashLength" : 10, - "renderer" : "flot", - "pointradius" : 5 + ], + "datasource": "Local", + "pointradius": 5, + "minSpan": 2 } - ], - "repeatIteration" : null, - "repeat" : null + ], + "showTitle": true, + "repeatIteration": null } - ], - "graphTooltip" : 0, - "links" : [], - "templating" : { - "list" : [] - }, - "tags" : [], - "editable" : false, - "title" : "Alert Status" + ], + "templating": { + "list": [] + }, + "links": [], + "tags": [], + "graphTooltip": 0, + "hideControls": true, + "title": "Alert Status", + "editable": false, + "refresh": "10s", + "annotations": { + "list": [] + }, + "gnetId": null, + "version": 15, + "time": { + "to": "now", + "from": "now-1h" + }, + "timezone": "browser", + "schemaVersion": 14, + "timepicker": { + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ], + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "id": 24 } }