From: Paul Cuzner Date: Thu, 8 Oct 2020 03:30:56 +0000 (+1300) Subject: mgr/prometheus: Add healthcheck metric for SLOW_OPS X-Git-Tag: v14.2.17~4^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cb0d3154a676edb955467dea42eb931743dd67a1;p=ceph.git mgr/prometheus: Add healthcheck metric for SLOW_OPS SLOW_OPS is triggered by the op tracker and generates a health alert, but healthchecks do not create metrics for prometheus to use as alert triggers. This change adds a SLOW_OPS metric, and provides a simple means to extend to other relevant health checks in the future. If the extraction of the value from the health check message fails, we log an error and remove the metric from the metric set. In addition the metric description has changed to better reflect the scenarios where SLOW_OPS can be triggered. Signed-off-by: Paul Cuzner (cherry picked from commit 2010432b5045c8f1dd0c052def497e590a1bf2fe) Conflicts: src/pybind/mgr/prometheus/module.py - Nautilus doesn't have the profile_method decorator, so needed to be removed --- diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml index eafc439fe40..8fad80310e4 100644 --- a/monitoring/prometheus/alerts/ceph_default_alerts.yml +++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml @@ -236,3 +236,15 @@ groups: description: > Pool {{ $labels.name }} will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. 
+ + - name: healthchecks + rules: + - alert: Slow OSD Ops + expr: ceph_healthcheck_slow_ops > 0 + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + description: > + {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded) diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 9b2efc26f09..16fd6f7fdc7 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -11,7 +11,7 @@ import time from mgr_module import MgrModule, MgrStandbyModule, CommandResult, PG_STATES from mgr_util import get_default_addr from rbd import RBD - +from collections import namedtuple try: from typing import Optional except: @@ -111,6 +111,11 @@ DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device', NUM_OBJECTS = ['degraded', 'misplaced', 'unfound'] +alert_metric = namedtuple('alert_metric', 'name description') +HEALTH_CHECKS = [ + alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process' ), +] + class Metric(object): def __init__(self, mtype, name, desc, labels=None): @@ -204,7 +209,7 @@ class MetricCollectionThread(threading.Thread): continue duration = time.time() - start_time - + sleep_time = self.mod.scrape_interval - duration if sleep_time < 0: self.mod.log.warning( @@ -452,14 +457,61 @@ class Module(MgrModule): 'Number of {} objects'.format(state), ) + for check in HEALTH_CHECKS: + path = 'healthcheck_{}'.format(check.name.lower()) + metrics[path] = Metric( + 'gauge', + path, + check.description, + ) + return metrics def get_health(self): + + def _get_value(message, delim=' ', word_pos=0): + """Extract value from message (default is 1st field)""" + v_str = message.split(delim)[word_pos] + if v_str.isdigit(): + return int(v_str), 0 + return 0, 1 + health = json.loads(self.get('health')['json']) + # set overall health self.metrics['health_status'].set( health_status_to_number(health['status']) ) + # Examine the health to see if any 
health checks triggered need to + # become a metric. + active_healthchecks = health.get('checks', {}) + active_names = active_healthchecks.keys() + + for check in HEALTH_CHECKS: + path = 'healthcheck_{}'.format(check.name.lower()) + + if path in self.metrics: + + if check.name in active_names: + check_data = active_healthchecks[check.name] + message = check_data['summary'].get('message', '') + v, err = 0, 0 + + if check.name == "SLOW_OPS": + # 42 slow ops, oldest one blocked for 12 sec, daemons [osd.0, osd.3] have slow ops. + v, err = _get_value(message) + + if err: + self.log.error("healthcheck {} message format is incompatible and has been dropped".format(check.name)) + # drop the metric, so it's no longer emitted + del self.metrics[path] + continue + else: + self.metrics[path].set(v) + else: + # health check is not active, so give it a default of 0 + self.metrics[path].set(0) + def get_pool_stats(self): # retrieve pool stats to provide per pool recovery metrics # (osd_pool_stats moved to mgr in Mimic)