.. automethod:: MgrModule.get_perf_schema
.. automethod:: MgrModule.get_counter
.. automethod:: MgrModule.get_mgr_id
+.. automethod:: MgrModule.get_daemon_health_metrics
Exposing health checks
----------------------
11,
9
),
+ $.addTableSchema(
+ '$datasource',
+ 'This table shows the 10 hosts with the highest number of slow ops',
+ { col: 2, desc: true },
+ [
+ $.overviewStyle('Instance', 'instance', 'string', 'short'),
+ $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Top Slow Ops per Host',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
]),
}
.addTargets([$.addTargetSchema(
'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
)]),
+ $.addTableSchema(
+ '$datasource',
+ 'This table shows the 10 OSDs with the highest number of slow ops',
+ { col: 2, desc: true },
+ [
+ $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
+ $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
+ $.overviewStyle('', '/.*/', 'hidden', 'short'),
+ ],
+ 'Top Slow Ops',
+ 'table'
+ )
+ .addTarget(
+ $.addTargetSchema(
+ |||
+ topk(10,
+ (ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
+ )
+ ||| % $.matchers(),
+ '',
+ 'table',
+ 1,
+ true
+ )
+ ) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
]),
'osd-device-details.json':
local OsdDeviceDetailsPanel(title,
"show": true
}
]
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "This table shows the 10 hosts with the highest number of slow ops",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 0,
+ "y": 40
+ },
+ "id": 15,
+ "links": [ ],
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Instance",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "instance",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Slow Ops",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Top Slow Ops per Host",
+ "transform": "table",
+ "type": "table"
}
],
"refresh": "30s",
"show": true
}
]
+ },
+ {
+ "columns": [ ],
+ "datasource": "$datasource",
+ "description": "This table shows the 10 OSDs with the highest number of slow ops",
+ "gridPos": {
+ "h": 8,
+ "w": 4,
+ "x": 0,
+ "y": 20
+ },
+ "id": 13,
+ "links": [ ],
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "OSD ID",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "ceph_daemon",
+ "thresholds": [ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "Slow Ops",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value",
+ "thresholds": [ ],
+ "type": "number",
+ "unit": "none",
+ "valueMaps": [ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "/.*/",
+ "thresholds": [ ],
+ "type": "hidden",
+ "unit": "short",
+ "valueMaps": [ ]
+ }
+ ],
+ "targets": [
+ {
+ "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n",
+ "format": "table",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Top Slow Ops",
+ "transform": "table",
+ "type": "table"
}
],
"refresh": "30s",
description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)',
},
},
+ {
+ alert: 'CephDaemonSlowOps',
+ 'for': '30s',
+ expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0',
+ labels: { severity: 'warning', type: 'ceph_default' },
+ annotations: {
+ documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
+ summary: '{{ $labels.ceph_daemon }} operations are slow to complete',
+ description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)',
+ },
+ },
],
},
{
labels:
severity: "warning"
type: "ceph_default"
+ - alert: "CephDaemonSlowOps"
+ for: "30s"
+ expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
+ labels:
+ severity: "warning"
+ type: "ceph_default"
+ annotations:
+ summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
+ description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
- name: "cephadm"
rules:
- alert: "CephadmUpgradeFailed"
summary: OSD operations are slow to complete
description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
+ # slow daemon ops
+ - interval: 1m
+ input_series:
+ - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
+ values: '1+0x120'
+ promql_expr_test:
+ - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
+ eval_time: 1m
+ exp_samples:
+ - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
+ job="ceph", type="SLOW_OPS"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 20m
+ alertname: CephDaemonSlowOps
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ ceph_daemon: "osd.1"
+ job: ceph
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+ summary: osd.1 operations are slow to complete
+ description: "osd.1 operations are taking too long to process (complaint time exceeded)"
+
# CEPHADM orchestrator alert triggers
- interval: 30s
input_series:
dout(7) << "unregistering msgr client handle " << addrv << dendl;
py_module_registry.unregister_client(name, addrv);
}
+
+PyObject* ActivePyModules::get_daemon_health_metrics()
+{
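+ // Returns a Python dict of {daemon name: [{"value": n, "type": metric name}, ...]} covering every daemon known to the mgr.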
+ without_gil_t no_gil;
+ return daemon_state.with_daemons_by_server([&no_gil]
+ (const std::map<std::string, DaemonStateCollection> &all) {
+ no_gil.acquire_gil();
+ PyFormatter f;
+ for (const auto &[hostname, daemon_state] : all) {
+ for (const auto &[key, state] : daemon_state) {
+ f.open_array_section(ceph::to_string(key));
+ for (const auto &metric : state->daemon_health_metrics) {
+ f.open_object_section(metric.get_type_name());
+ f.dump_int("value", metric.get_n1());
+ f.dump_string("type", metric.get_type_name());
+ f.close_section();
+ }
+ f.close_section();
+ }
+ }
+ return f.get();
+ });
+}
void cluster_log(const std::string &channel, clog_type prio,
const std::string &message);
+ PyObject* get_daemon_health_metrics();
bool inject_python_on() const;
void update_cache_metrics();
Py_RETURN_NONE;
}
+static PyObject*
+ceph_get_daemon_health_metrics(BaseMgrModule *self, PyObject *args)
+{
+ return self->py_modules->get_daemon_health_metrics();
+}
+
PyMethodDef BaseMgrModule_methods[] = {
{"_ceph_get", (PyCFunction)ceph_state_get, METH_VARARGS,
"Get a cluster object"},
{"_ceph_unregister_client", (PyCFunction)ceph_unregister_client,
METH_VARARGS, "Unregister RADOS instance for potential blocklisting"},
+ {"_ceph_get_daemon_health_metrics", (PyCFunction)ceph_get_daemon_health_metrics,
+ METH_VARARGS, "Get health metrics for all daemons"},
+
{NULL, NULL, 0, NULL}
};
: type(type_), value(n)
{}
DaemonHealthMetric(daemon_metric type_, uint32_t n1, uint32_t n2)
- : type(type_), value(n1, n2)
+ : type(type_), value(n1, n2)
{}
+
daemon_metric get_type() const {
return type;
}
uint32_t get_n2() const {
return value.n2;
}
+
DENC(DaemonHealthMetric, v, p) {
DENC_START(1, 1, p);
denc(v.type, p);
DENC_FINISH(p);
}
+ std::string get_type_name() const {
+ return daemon_metric_name(get_type());
+ }
+
friend std::ostream& operator<<(std::ostream& out, const DaemonHealthMetric& m) {
return out << daemon_metric_name(m.get_type()) << "("
<< m.get_n() << "|(" << m.get_n1() << "," << m.get_n2() << "))";
<< std::dec << dendl;
continue;
}
- dout(20) << " + " << state->key << " "
- << metric << dendl;
tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
std::move(collector));
}
def _ceph_unregister_client(self, addrs: str) -> None: ...
def _ceph_register_client(self, addrs: str) -> None: ...
def _ceph_is_authorized(self, arguments: Dict[str, str]) -> bool: ...
+ def _ceph_get_daemon_health_metrics(self) -> Dict[str, List[Dict[str, Any]]]: ...
"""
return self._ceph_get_mds_perf_counters(query_id)
+ def get_daemon_health_metrics(self) -> Dict[str, List[Dict[str, Any]]]:
+ """
+ Get the list of health metrics per daemon. This includes SLOW_OPS health metrics
+ in MON and OSD daemons, and PENDING_CREATING_PGS health metrics for OSDs.
+ """
+ return self._ceph_get_daemon_health_metrics()
+
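A minimal consumption sketch (illustrative only, not part of this change; the report_slow_ops helper below is hypothetical, only get_daemon_health_metrics() comes from the API added above). Given the documented return shape of {daemon_name: [{"type": ..., "value": ...}, ...]}, a module could flag daemons reporting slow ops like this:

    from mgr_module import MgrModule

    def report_slow_ops(module: MgrModule) -> None:
        # Each metric dict carries a "type" (e.g. "SLOW_OPS") and an integer "value".
        for daemon, metrics in module.get_daemon_health_metrics().items():
            for metric in metrics:
                if metric['type'] == 'SLOW_OPS' and metric['value'] > 0:
                    module.log.warning('%s reports %d slow ops', daemon, metric['value'])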
def is_authorized(self, arguments: Dict[str, str]) -> bool:
"""
Verifies that the current session caps permit executing the py service
self.metrics[path].set(stats['stat_sum']['num_objects_repaired'],
labelvalues=(stats['poolid'],))
+ def get_all_daemon_health_metrics(self) -> None:
+ daemon_metrics = self.get_daemon_health_metrics()
+ self.log.debug('daemon health metrics: %s', daemon_metrics)
+ for daemon_name, health_metrics in daemon_metrics.items():
+ for health_metric in health_metrics:
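+ # Unique cache key per (daemon, metric type) pair; every entry is exported under the single 'daemon_health_metrics' metric family.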
+ path = f'daemon_health_metrics{daemon_name}{health_metric["type"]}'
+ self.metrics[path] = Metric(
+ 'counter',
+ 'daemon_health_metrics',
+ 'Health metrics for Ceph daemons',
+ ('type', 'ceph_daemon',)
+ )
+ self.metrics[path].set(health_metric['value'], labelvalues=(
+ health_metric['type'], daemon_name,))
+
@profile_method(True)
def collect(self) -> str:
# Clear the metrics before scraping
self.get_pg_status()
self.get_pg_repaired_objects()
self.get_num_objects()
+ self.get_all_daemon_health_metrics()
for daemon, counters in self.get_all_perf_counters().items():
for path, counter_info in counters.items():