mgr/prometheus: Add healthcheck metric for SLOW_OPS
author    Paul Cuzner <pcuzner@redhat.com>
          Thu, 8 Oct 2020 03:30:56 +0000 (16:30 +1300)
committer Paul Cuzner <pcuzner@redhat.com>
          Sun, 28 Feb 2021 23:49:50 +0000 (12:49 +1300)
SLOW_OPS is triggered by the op tracker and generates a health
alert, but health checks do not create metrics that Prometheus
can use as alert triggers. This change adds a SLOW_OPS metric
and provides a simple means of extending the same handling to
other relevant health checks in the future.
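
As a sketch of that extension point, new checks would be registered
by adding a row to the HEALTH_CHECKS table introduced below (the
OBJECT_UNFOUND entry and its description here are illustrative
assumptions, not part of this commit):

    from collections import namedtuple

    alert_metric = namedtuple('alert_metric', 'name description')

    HEALTH_CHECKS = [
        alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
        # Hypothetical second entry: the name must match a Ceph health
        # check code; each row becomes a ceph_healthcheck_<name> gauge.
        alert_metric('OBJECT_UNFOUND', 'Objects reported as unfound by the cluster'),
    ]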

If extracting the value from the health check message fails,
we log an error and remove the metric from the metric set. In
addition, the metric description has been changed to better
reflect the scenarios in which SLOW_OPS can be triggered.
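
Concretely, the extraction is a first-field parse that returns an
error flag for the fallback path; a standalone sketch of that
behaviour, with the sample message taken from the inline comment
in the diff:

    def _get_value(message, delim=' ', word_pos=0):
        """Extract value from message (default is 1st field)"""
        v_str = message.split(delim)[word_pos]
        if v_str.isdigit():
            return int(v_str), 0   # (value, no error)
        return 0, 1                # (fallback, error) -> metric is dropped

    msg = "42 slow ops, oldest one blocked for 12 sec, daemons [osd.0, osd.3] have slow ops."
    assert _get_value(msg) == (42, 0)
    assert _get_value("slow ops detected") == (0, 1)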

Signed-off-by: Paul Cuzner <pcuzner@redhat.com>
(cherry picked from commit 2010432b5045c8f1dd0c052def497e590a1bf2fe)

Conflicts:
src/pybind/mgr/prometheus/module.py
- Nautilus doesn't have the profile_method decorator, so it needed to be removed

monitoring/prometheus/alerts/ceph_default_alerts.yml
src/pybind/mgr/prometheus/module.py

diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
index eafc439fe401501555c9435c84859592139dd7b3..8fad80310e4caf01fd8f7a7b44f74376263d51ec 100644
@@ -236,3 +236,15 @@ groups:
           description: >
             Pool {{ $labels.name }} will be full in less than 5 days
             assuming the average fill-up rate of the past 48 hours.
+
+  - name: healthchecks
+    rules:
+      - alert: Slow OSD Ops
+        expr: ceph_healthcheck_slow_ops > 0
+        for: 30s
+        labels:
+          severity: warning
+          type: ceph_default
+        annotations:
+          description: >
+            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
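
For orientation, when SLOW_OPS is active the manager module exposes
a gauge that this expression matches; a hedged illustration of the
exposed sample (the value 42 is taken from the example message later
in this diff):

    ceph_healthcheck_slow_ops 42.0

with the annotation then rendering as "42 OSD requests are taking
too long to process (osd_op_complaint_time exceeded)".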
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 9b2efc26f09f570bcf1bde5d30c2869927ae7df4..16fd6f7fdc781169aa85743821910bf00a2dd401 100644
@@ -11,7 +11,7 @@ import time
 from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES
 from mgr_util import get_default_addr
 from rbd import RBD
-
+from collections import namedtuple
 try:
     from typing import Optional
 except:
@@ -111,6 +111,11 @@ DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device',
 
 NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
 
+alert_metric = namedtuple('alert_metric', 'name description')
+HEALTH_CHECKS = [
+    alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
+]
+
 
 class Metric(object):
     def __init__(self, mtype, name, desc, labels=None):
@@ -204,7 +209,7 @@ class MetricCollectionThread(threading.Thread):
                     continue
 
                 duration = time.time() - start_time
-                
+
                 sleep_time = self.mod.scrape_interval - duration
                 if sleep_time < 0:
                     self.mod.log.warning(
@@ -452,14 +457,61 @@ class Module(MgrModule):
                 'Number of {} objects'.format(state),
             )
 
+        for check in HEALTH_CHECKS:
+            path = 'healthcheck_{}'.format(check.name.lower())
+            metrics[path] = Metric(
+                'gauge',
+                path,
+                check.description,
+            )
+
         return metrics
 
     def get_health(self):
+
+        def _get_value(message, delim=' ', word_pos=0):
+            """Extract value from message (default is 1st field)"""
+            v_str = message.split(delim)[word_pos]
+            if v_str.isdigit():
+                return int(v_str), 0
+            return 0, 1
+
         health = json.loads(self.get('health')['json'])
+        # set overall health
         self.metrics['health_status'].set(
             health_status_to_number(health['status'])
         )
 
+        # Examine the health to see if any health checks triggered need to
+        # become a metric.
+        active_healthchecks = health.get('checks', {})
+        active_names = active_healthchecks.keys()
+
+        for check in HEALTH_CHECKS:
+            path = 'healthcheck_{}'.format(check.name.lower())
+
+            if path in self.metrics:
+
+                if check.name in active_names:
+                    check_data = active_healthchecks[check.name]
+                    message = check_data['summary'].get('message', '')
+                    v, err = 0, 0
+
+                    if check.name == "SLOW_OPS":
+                        # 42 slow ops, oldest one blocked for 12 sec, daemons [osd.0, osd.3] have slow ops.
+                        v, err = _get_value(message)
+
+                    if err:
+                        self.log.error("healthcheck {} message format is incompatible and has been dropped".format(check.name))
+                        # drop the metric, so it's no longer emitted
+                        del self.metrics[path]
+                        continue
+                    else:
+                        self.metrics[path].set(v)
+                else:
+                    # health check is not active, so give it a default of 0
+                    self.metrics[path].set(0)
+
     def get_pool_stats(self):
         # retrieve pool stats to provide per pool recovery metrics
         # (osd_pool_stats moved to mgr in Mimic)