{
- "meta": {
- "canSave": true,
- "created": "2017-08-03T21:42:28Z",
- "canStar": true,
- "expires": "0001-01-01T00:00:00Z",
- "updated": "2017-08-18T05:26:10Z",
- "slug": "alert-status",
- "version": 15,
- "createdBy": "admin",
- "updatedBy": "admin",
- "type": "db",
- "canEdit": true
- },
- "dashboard": {
- "style": "dark",
- "rows": [
- {
- "repeat": null,
- "titleSize": "h6",
- "collapse": false,
- "title": "Dashboard Row",
- "height": "250px",
- "repeatRowId": null,
- "panels": [
- {
- "span": 12,
- "stateFilter": [
- "alerting"
- ],
- "links": [],
- "show": "current",
- "title": "Active Ceph Alert List",
- "onlyAlertsOnDashboard": true,
- "limit": "20",
- "sortOrder": 3,
- "type": "alertlist",
- "id": 1
- }
+ "dashboard": {
+ "annotations": {
+ "list": []
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": true,
+ "id": 81,
+ "links": [],
+ "refresh": false,
+ "rows": [
+ {
+ "collapse": false,
+ "height": "250px",
+ "panels": [
+ {
+ "id": 1,
+ "limit": "20",
+ "links": [],
+ "onlyAlertsOnDashboard": true,
+ "show": "current",
+ "sortOrder": 3,
+ "span": 12,
+ "stateFilter": [
+ "alerting"
+ ],
+ "title": "Active Ceph Alert List",
+ "type": "alertlist"
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 0
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "20s",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "max"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "keep_state",
+ "frequency": "10s",
+ "handler": 1,
+ "message": "Cluster Health is not OK",
+ "name": "Overall Ceph Health",
+ "noDataState": "no_data",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {
+ "Ceph Health": "#890F02",
+ "Ceph Health (0:OK, 4:Warning,8:Error)": "#DEDAF7",
+ "ceph health": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "The chart plots the cluster's health over time. Health is depicted as an integer: 0, 4 or 8, where 0 is OK, 4 is WARN and 8 represents an ERROR state.",
+ "fill": 1,
+ "hideTimeOverride": false,
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxDataPoints": "360",
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": true,
+ "targets": [
+ {
+ "refId": "A",
+ "target": "alias(maxSeries(consolidateBy(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.mon.health,0)),\"max\")),\"Ceph Health\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 0
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Overall Ceph Health",
+ "tooltip": {
+ "shared": false,
+ "sort": 1,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 1,
+ "max": "10",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 0
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "1m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "max"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "keep_state",
+ "frequency": "60s",
+ "handler": 1,
+ "message": "Disks near full detected within the cluster. Warning threshold is 85% full.",
+ "name": "Disks Near Full",
+ "noDataState": "ok",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "This shows how many disks are more than 85% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSDs.",
+ "fill": 1,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "hide": true,
+ "refId": "A",
+ "target": "currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)",
+ "textEditor": true
+ },
+ {
+ "refId": "B",
+ "target": "alias(countSeries(#A),\"OSDs Near Full\")",
+ "targetFull": "alias(countSeries(currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)),\"OSDs Near Full\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 0
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Disks Near Full",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 0
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "C",
+ "30s",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "max"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "10s",
+ "handler": 1,
+ "message": "OSD Down event",
+ "name": "OSDs Down",
+ "noDataState": "ok",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": true,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "Count of OSDs currently in a DOWN state",
+ "fill": 2,
+ "hideTimeOverride": true,
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": false,
+ "linewidth": 2,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": true,
+ "targets": [
+ {
+ "hide": true,
+ "refId": "A",
+ "target": "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\")",
+ "textEditor": true
+ },
+ {
+ "hide": true,
+ "refId": "B",
+ "target": "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")",
+ "textEditor": true
+ },
+ {
+ "hide": false,
+ "refId": "C",
+ "target": "alias(diffSeries(#A,#B), \"down\")",
+ "targetFull": "alias(diffSeries(alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\"),alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")), \"down\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 0
+ }
+ ],
+ "timeFrom": "5m",
+ "timeShift": null,
+ "title": "OSDs Down",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 85
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "C",
+ "1h",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "avg"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "60s",
+ "handler": 1,
+ "message": "Cluster Capacity Limit Warning",
+ "name": "Cluster Capacity",
+ "noDataState": "keep_state",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "This trigger raises a notification if raw used capacity crosses the 85% threshold of the Ceph cluster",
+ "fill": 1,
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "hide": true,
+ "refId": "A",
+ "target": "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")",
+ "textEditor": true
+ },
+ {
+ "hide": true,
+ "refId": "B",
+ "target": "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\")",
+ "textEditor": true
+ },
+ {
+ "refId": "C",
+ "target": "alias(asPercent(#B, #A), \"Raw Capacity Used %\")",
+ "targetFull": "alias(asPercent(alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\"), alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")), \"Raw Capacity Used %\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 85
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster Capacity",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "",
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 0
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "1m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "last"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "60s",
+ "handler": 1,
+ "message": "PG's stuck inactive",
+ "name": "PG's Stuck",
+ "noDataState": "no_data",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "This chart shows whether there are PGs in a stuck state that need manual intervention to resolve.",
+ "fill": 2,
+ "hideTimeOverride": true,
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "refId": "A",
+ "target": "alias(maxSeries(consolidateBy(collectd.*.$domain.cephmetrics.gauge.*.mon.num_pgs_stuck, \"maxSeries\")), \"# pg's stuck inactive\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 0
+ }
+ ],
+ "timeFrom": "6h",
+ "timeShift": null,
+ "title": "PG's Stuck",
+ "tooltip": {
+ "shared": false,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+ "total"
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 0
+ ],
+ "type": "lt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "5m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "min"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "60s",
+ "handler": 1,
+ "message": "OSD Host Loss Free Space Check Failed",
+ "name": "OSD Host Loss Check",
+ "noDataState": "ok",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {
+ "Largest OSD Host": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "This graph checks whether 90% of the cluster's raw free space is enough to absorb the loss of the largest OSD host",
+ "fill": 1,
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "hide": true,
+ "refId": "A",
+ "target": "alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\")",
+ "textEditor": true
+ },
+ {
+ "hide": true,
+ "refId": "B",
+ "target": "alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")",
+ "textEditor": true
+ },
+ {
+ "refId": "C",
+ "target": "alias(diffSeries(#A,#B),\"freespace after Node loss\")",
+ "targetFull": "alias(diffSeries(alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\"),alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")),\"freespace after Node loss\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "lt",
+ "value": 0
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "OSD Host Loss Check",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "decbytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 1000
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "1m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "max"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "30s",
+ "handler": 1,
+ "message": "OSD Response time is > 1s",
+ "name": "Slow OSD responses alert",
+ "noDataState": "ok",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {
+ "Largest OSD Host": "#890F02"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "description": "Graph checking for OSD Latencies that are above 1s.",
+ "fill": 1,
+ "hideTimeOverride": true,
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "maxDataPoints": "",
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "refId": "A",
+ "target": "aliasByNode(currentAbove(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.osd.*.perf.await,-1)),1000),1,-3)",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 1000
+ }
+ ],
+ "timeFrom": "1h",
+ "timeShift": null,
+ "title": "Slow OSD responses",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "ms",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 10
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "5m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "max"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "keep_state",
+ "frequency": "30s",
+ "handler": 1,
+ "message": "Network rx/tx issues detected",
+ "name": "Network Errors alert",
+ "noDataState": "no_data",
+ "notifications": [
+ {
+ "id": 1
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "description": "Checks all interfaces for dropped/error packets, and alerts if more than 10 are seen in a 5m interval",
+ "fill": 1,
+ "id": 11,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "refId": "A",
+ "target": "groupByNode(collectd.*.$domain.interface.*.if_{dropped,errors}.*,1,\"sumSeries\")",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 10
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network Errors",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 85
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "5m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "avg"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "60s",
+ "handler": 1,
+ "name": "Pool Capacity",
+ "noDataState": "keep_state",
+ "notifications": [
+ {
+ "id": 7
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Local",
+ "fill": 5,
+ "id": 12,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "refId": "A",
+ "target": "groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.pools.*.percent_used,-2,'maxSeries')",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 85
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Pool Capacity",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "alert": {
+ "conditions": [
+ {
+ "evaluator": {
+ "params": [
+ 0
+ ],
+ "type": "gt"
+ },
+ "operator": {
+ "type": "and"
+ },
+ "query": {
+ "params": [
+ "A",
+ "1m",
+ "now"
+ ]
+ },
+ "reducer": {
+ "params": [],
+ "type": "max"
+ },
+ "type": "query"
+ }
+ ],
+ "executionErrorState": "alerting",
+ "frequency": "30s",
+ "handler": 1,
+ "message": "RGW Down",
+ "name": "RGWs Down alert",
+ "noDataState": "no_data",
+ "notifications": [
+ {
+ "id": 7
+ }
+ ]
+ },
+ "aliasColors": {},
+ "bars": true,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 13,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": false,
+ "linewidth": 1,
+ "links": [],
+ "minSpan": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 2,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "refId": "A",
+ "target": "aliasByNode(countSeries(currentBelow(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.derive.*.rgw.put),-666),-0.5)),1)",
+ "textEditor": true
+ }
+ ],
+ "thresholds": [
+ {
+ "colorMode": "critical",
+ "fill": true,
+ "line": true,
+ "op": "gt",
+ "value": 0
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "RGWs Down",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": "Hosts Down",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "Health Checks",
+ "titleSize": "h5"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "2017-09-19T22:02:49.497Z",
+ "to": "2017-09-19T22:10:13.739Z"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
],
- "showTitle": false,
- "repeatIteration": null
- },
- {
- "repeat": null,
- "titleSize": "h5",
- "collapse": false,
- "title": "Health Checks",
- "height": 250,
- "repeatRowId": null,
- "panels": [
- {
- "bars": false,
- "timeFrom": null,
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "fill": true,
- "value": 0,
- "op": "gt"
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 2,
- "steppedLine": true,
- "id": 2,
- "maxDataPoints": "360",
- "fill": 1,
- "span": 2,
- "title": "Overall Ceph Health",
- "tooltip": {
- "sort": 1,
- "shared": false,
- "value_type": "individual"
- },
- "targets": [
- {
- "textEditor": true,
- "target": "alias(maxSeries(consolidateBy(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.mon.health,0)),\"max\")),\"Ceph Health\")",
- "refId": "A"
- }
- ],
- "yaxes": [
- {
- "logBase": 1,
- "format": "short",
- "max": "10",
- "min": "0",
- "label": "",
- "show": true
- },
- {
- "logBase": 1,
- "show": false,
- "max": null,
- "format": "short",
- "label": null,
- "min": null
- }
- ],
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "The chart plots the clusters health, over time. Health is depicted as a integer; 0, 4 or 8 where 0 is OK, 4 is WARN and 8 represents an ERROR state.",
- "alert": {
- "noDataState": "no_data",
- "name": "Overall Ceph Health",
- "frequency": "10s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "keep_state",
- "message": "Cluster Health is not OK",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "20s",
- "now"
- ]
- },
- "evaluator": {
- "type": "gt",
- "params": [
- 0
- ]
- },
- "reducer": {
- "params": [],
- "type": "max"
- },
- "type": "query"
- }
- ]
- },
- "hideTimeOverride": false,
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {
- "Ceph Health (0:OK, 4:Warning,8:Error)": "#DEDAF7",
- "Ceph Health": "#890F02",
- "ceph health": "#890F02"
- },
- "lines": true,
- "legend": {
- "total": false,
- "min": false,
- "max": false,
- "show": true,
- "current": false,
- "values": false,
- "avg": false
- },
- "points": false,
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": null,
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "fill": true,
- "value": 0,
- "op": "gt"
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 1,
- "steppedLine": false,
- "targets": [
- {
- "textEditor": true,
- "hide": true,
- "target": "currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)",
- "refId": "A"
- },
- {
- "targetFull": "alias(countSeries(currentAbove(transformNull(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.osd_percent_used),0),85)),\"OSDs Near Full\")",
- "textEditor": true,
- "target": "alias(countSeries(#A),\"OSDs Near Full\")",
- "refId": "B"
- }
- ],
- "fill": 1,
- "span": 2,
- "title": "Disks Near Full",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "id": 3,
- "yaxes": [
- {
- "logBase": 1,
- "min": "0",
- "max": null,
- "format": "short",
- "label": null,
- "show": true
- },
- {
- "logBase": 1,
- "min": null,
- "max": null,
- "format": "short",
- "show": false,
- "label": null
- }
- ],
- "xaxis": {
- "buckets": null,
- "values": [],
- "mode": "time",
- "name": null,
- "show": true
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "This shows how many disks are at or above 80% full. Performance may degrade beyond this threshold on filestore (XFS) backed OSD's.",
- "alert": {
- "noDataState": "ok",
- "name": "Disks Near Full",
- "frequency": "60s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "keep_state",
- "message": "DIsks Near full detected within the cluster. Warning threshold is 80% full.",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "1m",
- "now"
- ]
- },
- "evaluator": {
- "params": [
- 0
- ],
- "type": "gt"
- },
- "reducer": {
- "type": "max",
- "params": []
- },
- "type": "query"
- }
- ]
- },
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {},
- "lines": true,
- "legend": {
- "total": false,
- "show": false,
- "max": false,
- "min": false,
- "current": false,
- "values": false,
- "avg": false
- },
- "points": false,
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": true,
- "timeFrom": "5m",
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "op": "gt",
- "value": 0,
- "fill": true
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 2,
- "steppedLine": true,
- "targets": [
- {
- "textEditor": true,
- "hide": true,
- "target": "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\")",
- "refId": "A"
- },
- {
- "hide": true,
- "textEditor": true,
- "refId": "B",
- "target": "alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")"
- },
- {
- "hide": false,
- "targetFull": "alias(diffSeries(alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd), \"max\")),\"total\"),alias(keepLastValue(consolidateBy(maxSeries(collectd.*.$domain.cephmetrics.gauge.*.mon.num_osd_up), \"max\")),\"up\")), \"down\")",
- "textEditor": true,
- "refId": "C",
- "target": "alias(diffSeries(#A,#B), \"down\")"
- }
- ],
- "fill": 2,
- "span": 2,
- "title": "OSDs Down",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "id": 4,
- "yaxes": [
- {
- "logBase": 1,
- "format": "short",
- "max": null,
- "min": "0",
- "label": null,
- "show": true
- },
- {
- "logBase": 1,
- "min": null,
- "max": null,
- "format": "short",
- "show": false,
- "label": null
- }
- ],
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "Count of OSDs currently in a DOWN state",
- "alert": {
- "noDataState": "ok",
- "name": "OSDs Down",
- "frequency": "10s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "alerting",
- "message": "OSD Down event",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "C",
- "30s",
- "now"
- ]
- },
- "evaluator": {
- "type": "gt",
- "params": [
- 0
- ]
- },
- "reducer": {
- "params": [],
- "type": "max"
- },
- "type": "query"
- }
- ]
- },
- "hideTimeOverride": true,
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {},
- "lines": false,
- "legend": {
- "rightSide": false,
- "total": false,
- "min": false,
- "max": false,
- "show": false,
- "current": false,
- "values": false,
- "alignAsTable": false,
- "avg": false,
- "hideZero": false
- },
- "points": false,
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": null,
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "op": "gt",
- "value": 85,
- "fill": true
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "stack": false,
- "linewidth": 1,
- "steppedLine": false,
- "targets": [
- {
- "hide": true,
- "textEditor": true,
- "refId": "A",
- "target": "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")"
- },
- {
- "hide": true,
- "textEditor": true,
- "refId": "B",
- "target": "alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\")"
- },
- {
- "targetFull": "alias(asPercent(alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_used,1, \"maxSeries\")), \"Used Raw\"), alias(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes,1, \"maxSeries\")), \"Raw Capacity\")), \"Raw Capacity Used %\")",
- "textEditor": true,
- "target": "alias(asPercent(#B, #A), \"Raw Capacity Used %\")",
- "refId": "C"
- }
- ],
- "fill": 1,
- "span": 2,
- "title": "Cluster Capacity",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "id": 5,
- "points": false,
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "This trigger raises a notification if the raw used crosses the 85% capacity threshold of the ceph cluster",
- "alert": {
- "noDataState": "keep_state",
- "name": "Cluster Capacity",
- "frequency": "60s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "alerting",
- "message": "Cluster Capacity Limit Warning",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "C",
- "1h",
- "now"
- ]
- },
- "evaluator": {
- "params": [
- 85
- ],
- "type": "gt"
- },
- "reducer": {
- "params": [],
- "type": "avg"
- },
- "type": "query"
- }
- ]
- },
- "dashLength": 10,
- "legend": {
- "total": false,
- "min": false,
- "max": false,
- "show": true,
- "current": false,
- "values": false,
- "avg": false
- },
- "timeShift": null,
- "aliasColors": {},
- "lines": true,
- "yaxes": [
- {
- "logBase": 1,
- "min": "0",
- "max": "100",
- "format": "percent",
- "show": true,
- "label": ""
- },
- {
- "logBase": 1,
- "min": null,
- "max": null,
- "format": "short",
- "show": false,
- "label": null
- }
- ],
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": "6h",
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "fill": true,
- "value": 0,
- "op": "gt"
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 2,
- "steppedLine": false,
- "targets": [
- {
- "textEditor": true,
- "refId": "A",
- "target": "alias(maxSeries(consolidateBy(collectd.*.$domain.cephmetrics.gauge.*.mon.num_pgs_stuck, \"maxSeries\")), \"# pg's stuck inactive\")"
- }
- ],
- "fill": 2,
- "span": 2,
- "title": "PG's Stuck",
- "tooltip": {
- "sort": 0,
- "shared": false,
- "value_type": "individual"
- },
- "id": 8,
- "yaxes": [
- {
- "logBase": 1,
- "min": "0",
- "max": null,
- "format": "short",
- "show": true,
- "label": null
- },
- {
- "logBase": 1,
- "show": false,
- "max": null,
- "format": "short",
- "min": null,
- "label": null
- }
- ],
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [
- "total"
- ],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "This chart shows whether there are pg's in a stuck state, that need manual intervention to resolve.",
- "alert": {
- "noDataState": "no_data",
- "name": "PG's Stuck",
- "frequency": "60s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "alerting",
- "message": "PG's stuck inactive",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "1m",
- "now"
- ]
- },
- "evaluator": {
- "params": [
- 0
- ],
- "type": "gt"
- },
- "reducer": {
- "type": "last",
- "params": []
- },
- "type": "query"
- }
- ]
- },
- "hideTimeOverride": true,
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {},
- "lines": true,
- "legend": {
- "total": false,
- "min": false,
- "max": false,
- "show": true,
- "current": false,
- "values": false,
- "avg": false
- },
- "points": false,
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": null,
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "op": "lt",
- "value": 0,
- "fill": true
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 1,
- "steppedLine": false,
- "targets": [
- {
- "hide": true,
- "textEditor": true,
- "refId": "A",
- "target": "alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\")"
- },
- {
- "textEditor": true,
- "hide": true,
- "target": "alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")",
- "refId": "B"
- },
- {
- "targetFull": "alias(diffSeries(alias(scale(maxSeries(groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.osd_bytes_avail,1, \"maxSeries\")),0.9), \"Raw Freespace\"),alias(maxSeries(groupByNode(keepLastValue(collectd.*.$domain.cephmetrics.gauge.*.osd.*.stat_bytes),1,\"sumSeries\")), \"Largest OSD Host\")),\"freespace after Node loss\")",
- "textEditor": true,
- "refId": "C",
- "target": "alias(diffSeries(#A,#B),\"freespace after Node loss\")"
- }
- ],
- "fill": 1,
- "span": 2,
- "title": "OSD Host Loss Check",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "id": 9,
- "yaxes": [
- {
- "logBase": 1,
- "show": true,
- "max": null,
- "format": "decbytes",
- "min": "0",
- "label": null
- },
- {
- "logBase": 1,
- "show": true,
- "max": null,
- "format": "short",
- "min": null,
- "label": null
- }
- ],
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "This graph checks the cluster @ 90% full is enough to support the loss of the largest OSD host",
- "alert": {
- "noDataState": "ok",
- "name": "OSD Host Loss Check",
- "frequency": "60s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "alerting",
- "message": "OSD Host Loss Free Space Check Failed",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "5m",
- "now"
- ]
- },
- "evaluator": {
- "type": "lt",
- "params": [
- 0
- ]
- },
- "reducer": {
- "type": "min",
- "params": []
- },
- "type": "query"
- }
- ]
- },
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {
- "Largest OSD Host": "#890F02"
- },
- "lines": true,
- "legend": {
- "total": false,
- "show": false,
- "max": false,
- "min": false,
- "current": false,
- "values": false,
- "avg": false
- },
- "points": false,
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": "1h",
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "fill": true,
- "value": 1000,
- "op": "gt"
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "stack": false,
- "linewidth": 1,
- "steppedLine": false,
- "targets": [
- {
- "textEditor": true,
- "refId": "A",
- "target": "aliasByNode(currentAbove(keepLastValue(transformNull(collectd.*.$domain.cephmetrics.gauge.*.osd.*.perf.await,-1)),1000),1,-3)"
- }
- ],
- "maxDataPoints": "",
- "fill": 1,
- "span": 2,
- "title": "Slow OSD responses",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "id": 10,
- "yaxes": [
- {
- "logBase": 1,
- "min": "0",
- "max": null,
- "format": "none",
- "label": "ms",
- "show": true
- },
- {
- "logBase": 1,
- "format": "short",
- "max": null,
- "min": null,
- "label": null,
- "show": false
- }
- ],
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "Graph checking for OSD Latencies that are above 1s.",
- "alert": {
- "noDataState": "ok",
- "name": "Slow OSD responses alert",
- "frequency": "30s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "alerting",
- "message": "OSD Response time is > 1s",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "1m",
- "now"
- ]
- },
- "evaluator": {
- "type": "gt",
- "params": [
- 1000
- ]
- },
- "reducer": {
- "type": "max",
- "params": []
- },
- "type": "query"
- }
- ]
- },
- "hideTimeOverride": true,
- "dashLength": 10,
- "legend": {
- "total": false,
- "show": true,
- "max": false,
- "min": false,
- "current": false,
- "values": false,
- "avg": false
- },
- "timeShift": null,
- "aliasColors": {
- "Largest OSD Host": "#890F02"
- },
- "lines": true,
- "points": false,
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": null,
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "op": "gt",
- "value": 10,
- "fill": true
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 1,
- "steppedLine": false,
- "id": 11,
- "fill": 1,
- "span": 2,
- "title": "Network Errors",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "targets": [
- {
- "textEditor": true,
- "target": "groupByNode(collectd.*.$domain.interface.*.if_{dropped,errors}.*,1,\"sumSeries\")",
- "refId": "A"
- }
- ],
- "points": false,
- "xaxis": {
- "buckets": null,
- "show": true,
- "values": [],
- "mode": "time",
- "name": null
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "description": "Checks all interfaces for dropped/error packets, and alerts if more than 10 are seen in a 5m interval",
- "alert": {
- "noDataState": "no_data",
- "name": "Network Errors alert",
- "frequency": "30s",
- "notifications": [
- {
- "id": 1
- }
- ],
- "handler": 1,
- "executionErrorState": "keep_state",
- "message": "Network rx/tx issues detected",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "5m",
- "now"
- ]
- },
- "evaluator": {
- "type": "gt",
- "params": [
- 10
- ]
- },
- "reducer": {
- "type": "max",
- "params": []
- },
- "type": "query"
- }
- ]
- },
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {},
- "lines": true,
- "legend": {
- "total": false,
- "min": false,
- "max": false,
- "show": false,
- "current": false,
- "values": false,
- "avg": false
- },
- "yaxes": [
- {
- "logBase": 1,
- "format": "none",
- "max": null,
- "min": "0",
- "label": null,
- "show": true
- },
- {
- "logBase": 1,
- "min": null,
- "max": null,
- "format": "short",
- "show": false,
- "label": null
- }
- ],
- "datasource": null,
- "pointradius": 5,
- "minSpan": 2
- },
- {
- "bars": false,
- "timeFrom": null,
- "links": [],
- "thresholds": [
- {
- "colorMode": "critical",
- "line": true,
- "fill": true,
- "value": 85,
- "op": "gt"
- }
- ],
- "spaceLength": 10,
- "nullPointMode": "null",
- "renderer": "flot",
- "linewidth": 2,
- "steppedLine": false,
- "targets": [
- {
- "textEditor": true,
- "refId": "A",
- "target": "groupByNode(collectd.*.$domain.cephmetrics.gauge.*.mon.pools.*.percent_used,-2,'maxSeries')"
- }
- ],
- "fill": 5,
- "span": 2,
- "title": "Pool Capacity",
- "tooltip": {
- "sort": 0,
- "shared": true,
- "value_type": "individual"
- },
- "id": 12,
- "points": false,
- "xaxis": {
- "buckets": null,
- "values": [],
- "mode": "time",
- "name": null,
- "show": true
- },
- "seriesOverrides": [],
- "percentage": false,
- "type": "graph",
- "dashes": false,
- "repeat": null,
- "alert": {
- "noDataState": "keep_state",
- "name": "Pool Capacity",
- "frequency": "60s",
- "notifications": [],
- "handler": 1,
- "executionErrorState": "alerting",
- "conditions": [
- {
- "operator": {
- "type": "and"
- },
- "query": {
- "params": [
- "A",
- "5m",
- "now"
- ]
- },
- "evaluator": {
- "params": [
- 85
- ],
- "type": "gt"
- },
- "reducer": {
- "params": [],
- "type": "avg"
- },
- "type": "query"
- }
- ]
- },
- "dashLength": 10,
- "stack": false,
- "timeShift": null,
- "aliasColors": {},
- "lines": true,
- "legend": {
- "avg": false,
- "min": false,
- "max": false,
- "show": true,
- "current": false,
- "values": false,
- "total": false
- },
- "yaxes": [
- {
- "logBase": 1,
- "format": "percent",
- "max": null,
- "min": null,
- "label": null,
- "show": true
- },
- {
- "logBase": 1,
- "show": true,
- "max": null,
- "format": "short",
- "label": null,
- "min": null
- }
- ],
- "datasource": "Local",
- "pointradius": 5,
- "minSpan": 2
- }
- ],
- "showTitle": true,
- "repeatIteration": null
- }
- ],
- "templating": {
- "list": []
- },
- "links": [],
- "tags": [],
- "graphTooltip": 0,
- "hideControls": true,
- "title": "Alert Status",
- "editable": false,
- "refresh": "10s",
- "annotations": {
- "list": []
- },
- "gnetId": null,
- "version": 15,
- "time": {
- "to": "now",
- "from": "now-1h"
- },
- "timezone": "browser",
- "schemaVersion": 14,
- "timepicker": {
- "time_options": [
- "5m",
- "15m",
- "1h",
- "6h",
- "12h",
- "24h",
- "2d",
- "7d",
- "30d"
- ],
- "refresh_intervals": [
- "5s",
- "10s",
- "30s",
- "1m",
- "5m",
- "15m",
- "30m",
- "1h",
- "2h",
- "1d"
- ]
- },
- "id": 24
- }
-}
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Alert Status",
+ "version": 7
+ },
+ "meta": {
+ "canEdit": true,
+ "canSave": true,
+ "canStar": true,
+ "created": "2017-09-19T01:06:13Z",
+ "createdBy": "admin@localhost",
+ "expires": "0001-01-01T00:00:00Z",
+ "slug": "alert-status",
+ "type": "db",
+ "updated": "2017-09-19T23:18:02Z",
+ "updatedBy": "admin@localhost",
+ "version": 7
+ }
+}
\ No newline at end of file