mgr/prometheus: Add healthcheck metric for SLOW_OPS

author Paul Cuzner <pcuzner@redhat.com>

Thu, 8 Oct 2020 03:30:56 +0000 (16:30 +1300)

committer Paul Cuzner <pcuzner@redhat.com>

Mon, 2 Nov 2020 02:30:49 +0000 (15:30 +1300)
author Paul Cuzner <pcuzner@redhat.com>
Thu, 8 Oct 2020 03:30:56 +0000 (16:30 +1300)
committer Paul Cuzner <pcuzner@redhat.com>
Mon, 2 Nov 2020 02:30:49 +0000 (15:30 +1300)
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml

index 51d19bfca963e9c94e413ff82977b9b67cc8e716..b14eb15460ccc7ee17739c1ac005fb50b8f81561 100644 (file)
--- a/monitoring/prometheus/alerts/ceph_default_alerts.yml
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -241,3 +241,15 @@ groups:
            description: >
              Pool {{ $labels.name }} will be full in less than 5 days
              assuming the average fill-up rate of the past 48 hours.
+
+  - name: healthchecks  # alerts driven by individual ceph health checks
+    rules:
+      - alert: Slow OSD Ops
+        expr: ceph_healthcheck_slow_ops > 0  # presumably the mgr 'healthcheck_slow_ops' gauge (0 while the check is inactive) - confirm the 'ceph_' prefix
+        for: 30s  # the expression must stay true for 30s before the alert fires
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py

index f50751a6774241a801c1e6820a44da639d82e6cc..f2abf00e4abbb01ad94b6669008e66cdbb832a7c 100644 (file)
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -11,6 +11,7 @@ import time
  from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
  from mgr_util import get_default_addr, profile_method
  from rbd import RBD
+from collections import namedtuple
  try:
      from typing import Optional, Dict, Any, Set
  except:
@@ -108,6 +109,11 @@ DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
  
  NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
  
+# Health checks exported as gauges; each entry is a (name, description) record.
+alert_metric = namedtuple('alert_metric', ['name', 'description'])
+HEALTH_CHECKS = [alert_metric(name='SLOW_OPS',
+                              description='OSD or Monitor requests taking a long time to process')]
+
  
  class Metric(object):
      def __init__(self, mtype, name, desc, labels=None):
@@ -432,15 +438,62 @@ class Module(MgrModule):
                  'Number of {} objects'.format(state),
              )
  
+        for check in HEALTH_CHECKS:
+            path = 'healthcheck_{}'.format(check.name.lower())
+            metrics[path] = Metric(
+                'gauge',
+                path,
+                check.description,
+            )
+
          return metrics
  
      @profile_method()
      def get_health(self):
+
+        def _get_value(message, delim=' ', word_pos=0):
+            """Extract value from message (default is 1st field)"""
+            v_str = message.split(delim)[word_pos]
+            if v_str.isdigit():
+                return int(v_str), 0
+            return 0, 1
+
          health = json.loads(self.get('health')['json'])
+        # set overall health
          self.metrics['health_status'].set(
              health_status_to_number(health['status'])
          )
  
+        # Examine the health to see if any health checks triggered need to
+        # become a metric.
+        active_healthchecks = health.get('checks', {})
+        active_names = active_healthchecks.keys()
+
+        for check in HEALTH_CHECKS:
+            path = 'healthcheck_{}'.format(check.name.lower())
+
+            if path in self.metrics:
+
+                if check.name in active_names:
+                    check_data = active_healthchecks[check.name]
+                    message = check_data['summary'].get('message', '')
+                    v, err = 0, 0
+
+                    if check.name == "SLOW_OPS":
+                        # 42 slow ops, oldest one blocked for 12 sec, daemons [osd.0, osd.3] have slow ops.
+                        v, err = _get_value(message)
+
+                    if err:
+                        self.log.error("healthcheck {} message format is incompatible and has been dropped".format(check.name))
+                        # drop the metric, so it's no longer emitted
+                        del self.metrics[path]
+                        continue
+                    else:
+                        self.metrics[path].set(v)
+                else:
+                    # health check is not active, so give it a default of 0
+                    self.metrics[path].set(0)
+
      @profile_method()
      def get_pool_stats(self):
          # retrieve pool stats to provide per pool recovery metrics
author	Paul Cuzner <pcuzner@redhat.com>
	Thu, 8 Oct 2020 03:30:56 +0000 (16:30 +1300)
committer	Paul Cuzner <pcuzner@redhat.com>
	Mon, 2 Nov 2020 02:30:49 +0000 (15:30 +1300)
monitoring/prometheus/alerts/ceph_default_alerts.yml		patch | blob | history
src/pybind/mgr/prometheus/module.py		patch | blob | history