From: Pere Diaz Bou
Date: Fri, 11 Nov 2022 09:43:01 +0000 (+0100)
Subject: mgr/prometheus: expose daemon health metrics
X-Git-Tag: v17.2.6~264^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6a851ba3beee3cca058fc3a0932d2b4878ff335d;p=ceph.git

mgr/prometheus: expose daemon health metrics

Until now, daemon health metrics were stored but never used. One of the
most helpful of these metrics is SLOW_OPS for OSDs and MONs. This commit
exposes it, providing fine-grained, per-daemon metrics that make it easier
to find troublesome OSDs, instead of relying on a single cluster-wide
slow-ops health check.

Signed-off-by: Pere Diaz Bou
(cherry picked from commit 5a2b7c25b68f2c955356640041e4c7ed72416d4e)
---

diff --git a/doc/mgr/modules.rst b/doc/mgr/modules.rst
index 454839e2abd4..667664139739 100644
--- a/doc/mgr/modules.rst
+++ b/doc/mgr/modules.rst
@@ -508,6 +508,7 @@ function. This will result in a circular locking exception.
 .. automethod:: MgrModule.get_perf_schema
 .. automethod:: MgrModule.get_counter
 .. automethod:: MgrModule.get_mgr_id
+.. automethod:: MgrModule.get_daemon_health_metrics
 
 Exposing health checks
 ----------------------
diff --git a/monitoring/ceph-mixin/dashboards/host.libsonnet b/monitoring/ceph-mixin/dashboards/host.libsonnet
index 3e0b31f2c459..1c66120af9c2 100644
--- a/monitoring/ceph-mixin/dashboards/host.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/host.libsonnet
@@ -719,5 +719,30 @@ local g = import 'grafonnet/grafana.libsonnet';
       11,
       9
     ),
+    $.addTableSchema(
+      '$datasource',
+      'This table shows the 10 hosts with the highest number of slow ops',
+      { col: 2, desc: true },
+      [
+        $.overviewStyle('Instance', 'instance', 'string', 'short'),
+        $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+        $.overviewStyle('', '/.*/', 'hidden', 'short'),
+      ],
+      'Top Slow Ops per Host',
+      'table'
+    )
+    .addTarget(
+      $.addTargetSchema(
+        |||
+          topk(10,
+            (sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
+          )
+        ||| % $.matchers(),
+        '',
+        'table',
+        1,
+        true
+      )
+    ) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
   ]),
 }
diff --git a/monitoring/ceph-mixin/dashboards/osd.libsonnet b/monitoring/ceph-mixin/dashboards/osd.libsonnet
index 129b74ba6669..0ea43c96ff9f 100644
--- a/monitoring/ceph-mixin/dashboards/osd.libsonnet
+++ b/monitoring/ceph-mixin/dashboards/osd.libsonnet
@@ -300,6 +300,31 @@ local g = import 'grafonnet/grafana.libsonnet';
       .addTargets([$.addTargetSchema(
         'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
       )]),
+    $.addTableSchema(
+      '$datasource',
+      'This table shows the 10 OSDs with the highest number of slow ops',
+      { col: 2, desc: true },
+      [
+        $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+        $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+        $.overviewStyle('', '/.*/', 'hidden', 'short'),
+      ],
+      'Top Slow Ops',
+      'table'
+    )
+    .addTarget(
+      $.addTargetSchema(
+        |||
+          topk(10,
+            (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
+          )
+        ||| % $.matchers(),
+        '',
+        'table',
+        1,
+        true
+      )
+    ) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
   ]),
   'osd-device-details.json':
     local OsdDeviceDetailsPanel(title,
diff --git a/monitoring/ceph-mixin/dashboards_out/host-details.json b/monitoring/ceph-mixin/dashboards_out/host-details.json
index 93c51f009410..b8e08697a98b 100644
--- a/monitoring/ceph-mixin/dashboards_out/host-details.json
+++ b/monitoring/ceph-mixin/dashboards_out/host-details.json
@@ -1119,6 +1119,91 @@
           "show": true
         }
       ]
+    },
+    {
+
"columns": [ ], + "datasource": "$datasource", + "description": "This table shows the 10 hosts with the highest number of slow ops", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 40 + }, + "id": 15, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "Instance", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Slow Ops", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops per Host", + "transform": "table", + "type": "table" } ], "refresh": "30s", diff --git a/monitoring/ceph-mixin/dashboards_out/osds-overview.json b/monitoring/ceph-mixin/dashboards_out/osds-overview.json index 5ea8955b2974..b34c6642263d 100644 --- a/monitoring/ceph-mixin/dashboards_out/osds-overview.json +++ b/monitoring/ceph-mixin/dashboards_out/osds-overview.json @@ -860,6 +860,91 @@ "show": true } ] + }, + { + "columns": [ ], + "datasource": "$datasource", + "description": "This table shows the 10 OSDs with the highest number of slow ops", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 20 + }, + "id": 13, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "OSD ID", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "ceph_daemon", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Slow Ops", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + 
"timeShift": null, + "title": "Top Slow Ops", + "transform": "table", + "type": "table" } ], "refresh": "30s", diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index bed89a879064..7977e4035ecb 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -629,6 +629,17 @@ description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)', }, }, + { + alert: 'CephDaemonSlowOps', + 'for': '30s', + expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0', + labels: { severity: 'warning', type: 'ceph_default' }, + annotations: { + documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops', + summary: '{{ $labels.ceph_daemon }} operations are slow to complete', + description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)', + }, + }, ], }, { diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index bd773f27c540..33a5c5059b89 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -563,6 +563,16 @@ groups: labels: severity: "warning" type: "ceph_default" + - alert: "CephDaemonSlowOps" + for: "30s" + expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" + labels: + severity: 'warning' + type: 'ceph_default' + annotations: + summary: "{{ $labels.ceph_daemon }} operations are slow to complete" + description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" - name: "cephadm" rules: - alert: "CephadmUpgradeFailed" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 8cdb56349360..d68b43badf2b 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -679,6 +679,33 @@ tests: summary: OSD operations are slow to complete description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)" + # slow daemon ops + - interval : 1m + input_series: + - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}' + values: '1+0x120' + promql_expr_test: + - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0' + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283", + job="ceph", type="SLOW_OPS"}' + value: 1 + alert_rule_test: + - eval_time: 20m + alertname: CephDaemonSlowOps + exp_alerts: + - exp_labels: + instance: ceph:9283 + ceph_daemon: "osd.1" + job: ceph + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: osd.1 operations are slow to complete + description: "osd.1 operations are taking too long to process (complaint time exceeded)" + # CEPHADM orchestrator alert triggers - interval: 30s input_series: diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 58c3d9ee4d6f..8c070dd8cbf4 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -1515,3 +1515,26 @@ void ActivePyModules::unregister_client(std::string_view name, std::string addrs dout(7) << "unregistering msgr client handle " << addrv << dendl; 
 	py_module_registry.unregister_client(name, addrv);
 }
+
+PyObject* ActivePyModules::get_daemon_health_metrics()
+{
+  without_gil_t no_gil;
+  return daemon_state.with_daemons_by_server([&no_gil]
+      (const std::map<std::string, DaemonStateCollection> &all) {
+        no_gil.acquire_gil();
+        PyFormatter f;
+        for (const auto &[hostname, daemon_state] : all) {
+          for (const auto &[key, state] : daemon_state) {
+            f.open_array_section(ceph::to_string(key));
+            for (const auto &metric : state->daemon_health_metrics) {
+              f.open_object_section(metric.get_type_name());
+              f.dump_int("value", metric.get_n1());
+              f.dump_string("type", metric.get_type_name());
+              f.close_section();
+            }
+            f.close_section();
+          }
+        }
+        return f.get();
+  });
+}
diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h
index 218497c1e898..6015084e3cdc 100644
--- a/src/mgr/ActivePyModules.h
+++ b/src/mgr/ActivePyModules.h
@@ -222,6 +222,7 @@ public:
   void cluster_log(const std::string &channel, clog_type prio,
                    const std::string &message);
 
+  PyObject* get_daemon_health_metrics();
 
   bool inject_python_on() const;
   void update_cache_metrics();
diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc
index ca441d5e539d..4fb5b250b98b 100644
--- a/src/mgr/BaseMgrModule.cc
+++ b/src/mgr/BaseMgrModule.cc
@@ -1411,6 +1411,12 @@ ceph_unregister_client(BaseMgrModule *self, PyObject *args)
   Py_RETURN_NONE;
 }
 
+static PyObject*
+ceph_get_daemon_health_metrics(BaseMgrModule *self, PyObject *args)
+{
+  return self->py_modules->get_daemon_health_metrics();
+}
+
 PyMethodDef BaseMgrModule_methods[] = {
   {"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
    "Get a cluster object"},
@@ -1540,6 +1546,9 @@ PyMethodDef BaseMgrModule_methods[] = {
   {"_ceph_unregister_client", (PyCFunction)ceph_unregister_client,
     METH_VARARGS, "Unregister RADOS instance for potential blocklisting"},
 
+  {"_ceph_get_daemon_health_metrics", (PyCFunction)ceph_get_daemon_health_metrics,
+    METH_VARARGS, "Get health metrics for all daemons"},
+
   {NULL, NULL, 0, NULL}
 };
 
diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h
index ad3ea29efd46..ce0dad2c87e3 100644
--- a/src/mgr/DaemonHealthMetric.h
+++ b/src/mgr/DaemonHealthMetric.h
@@ -44,8 +44,9 @@ public:
     : type(type_), value(n)
   {}
   DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2)
-    : type(type_), value(n1, n2)
+    : type(type_), value(n1, n2)
   {}
+
   daemon_metric get_type() const {
     return type;
   }
@@ -58,6 +59,7 @@ public:
   uint32_t get_n2() const {
     return value.n2;
   }
+
   DENC(DaemonHealthMetric, v, p) {
     DENC_START(1, 1, p);
     denc(v.type, p);
@@ -65,6 +67,10 @@
     DENC_FINISH(p);
   }
 
+  std::string get_type_name() const {
+    return daemon_metric_name(get_type());
+  }
+
   friend std::ostream& operator<<(std::ostream& out, const DaemonHealthMetric& m) {
     return out << daemon_metric_name(m.get_type()) << "(" << m.get_n() << "|("
                << m.get_n1() << "," << m.get_n2() << "))";
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index 1b4744cf6b35..4afae0c20d8c 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -2640,8 +2640,6 @@ void DaemonServer::send_report()
 	       << std::dec << dendl;
 	  continue;
 	}
-	dout(20) << " + " << state->key << " "
-		 << metric << dendl;
 	tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
 						    std::move(collector));
       }
diff --git a/src/pybind/mgr/ceph_module.pyi b/src/pybind/mgr/ceph_module.pyi
index 171919295761..b89402d01be0 100644
--- a/src/pybind/mgr/ceph_module.pyi
+++ b/src/pybind/mgr/ceph_module.pyi
@@ -115,3 +115,4 @@ class BaseMgrModule(object):
     def _ceph_unregister_client(self, addrs: str) -> None: ...
     def _ceph_register_client(self, addrs: str) -> None: ...
     def _ceph_is_authorized(self, arguments: Dict[str, str]) -> bool: ...
+    def _ceph_get_daemon_health_metrics(self) -> Dict[str, List[Dict[str, Any]]]: ...
diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py
index 4afea482eef8..ff15ff69c772 100644
--- a/src/pybind/mgr/mgr_module.py
+++ b/src/pybind/mgr/mgr_module.py
@@ -2273,6 +2273,13 @@ class MgrModule(ceph_module.BaseMgrModule, MgrModuleLoggingMixin):
         """
         return self._ceph_get_mds_perf_counters(query_id)
 
+    def get_daemon_health_metrics(self) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Get the list of health metrics per daemon. This includes SLOW_OPS health metrics
+        for MON and OSD daemons, and PENDING_CREATING_PGS health metrics for OSDs.
+        """
+        return self._ceph_get_daemon_health_metrics()
+
     def is_authorized(self, arguments: Dict[str, str]) -> bool:
         """
         Verifies that the current session caps permit executing the py service
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 9afb24f7acbf..b06a73241d9a 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -1597,6 +1597,21 @@ class Module(MgrModule):
                 self.metrics[path].set(stats['stat_sum']['num_objects_repaired'],
                                        labelvalues=(stats['poolid'],))
 
+    def get_all_daemon_health_metrics(self) -> None:
+        daemon_metrics = self.get_daemon_health_metrics()
+        self.log.debug('daemon health metrics: %s', daemon_metrics)
+        for daemon_name, health_metrics in daemon_metrics.items():
+            for health_metric in health_metrics:
+                path = f'daemon_health_metrics{daemon_name}{health_metric["type"]}'
+                self.metrics[path] = Metric(
+                    'counter',
+                    'daemon_health_metrics',
+                    'Health metrics for Ceph daemons',
+                    ('type', 'ceph_daemon',)
+                )
+                self.metrics[path].set(health_metric['value'], labelvalues=(
+                    health_metric['type'], daemon_name,))
+
     @profile_method(True)
     def collect(self) -> str:
         # Clear the metrics before scraping
@@ -1615,6 +1630,7 @@ class Module(MgrModule):
         self.get_pg_status()
         self.get_pool_repaired_objects()
         self.get_num_objects()
+        self.get_all_daemon_health_metrics()
 
         for daemon, counters in self.get_all_perf_counters().items():
             for path, counter_info in counters.items():
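
For reference, a minimal sketch of how another manager module might consume the
new MgrModule.get_daemon_health_metrics() call. This is not part of the patch
above: the class name SlowOpsReporter and the logging behaviour are hypothetical,
and the returned shape is assumed from the PyFormatter output built in
ActivePyModules::get_daemon_health_metrics(), i.e. a dict keyed by daemon name
whose values are lists of {"type": ..., "value": ...} entries.

# Hypothetical mgr module snippet; illustrative only, not part of this commit.
from typing import Any, Dict, List

from mgr_module import MgrModule


class SlowOpsReporter(MgrModule):
    def log_slow_ops(self) -> None:
        # Expected shape (assumed from the formatter above):
        # {"osd.1": [{"type": "SLOW_OPS", "value": 3}], "mon.a": [...], ...}
        metrics: Dict[str, List[Dict[str, Any]]] = self.get_daemon_health_metrics()
        for daemon, health_metrics in metrics.items():
            for m in health_metrics:
                if m['type'] == 'SLOW_OPS' and m['value'] > 0:
                    self.log.warning('%s reports %d slow ops', daemon, m['value'])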