From 587ee4225ed41744685a10567e308caaac617425 Mon Sep 17 00:00:00 2001 From: Pere Diaz Bou Date: Mon, 6 Mar 2023 10:25:21 +0100 Subject: [PATCH] msgr: AsyncMessenger add faulted connections metrics Add msgr_connection_idle_timeouts and msgr_connection_ready_timeouts labeled perfcounters to keep track of failed connections with prometheus metrics. Signed-off-by: Pere Diaz Bou Fixes: https://tracker.ceph.com/issues/59076 --- src/msg/async/AsyncConnection.cc | 3 +++ src/msg/async/AsyncConnection.h | 1 + src/msg/async/ProtocolV1.cc | 1 + src/msg/async/ProtocolV2.cc | 1 + src/msg/async/Stack.h | 46 +++++++++++++++++++++++++++++--- 5 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 8051f5907ef..683be086efa 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -116,6 +116,7 @@ AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQu : Connection(cct, m), delay_state(NULL), async_msgr(m), conn_id(q->get_id()), logger(w->get_perf_counter()), + labeled_logger(w->get_labeled_perf_counter()), state(STATE_NONE), port(-1), dispatch_queue(q), recv_buf(NULL), recv_max_prefetch(std::max(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)), @@ -791,6 +792,7 @@ void AsyncConnection::tick(uint64_t id) << target_addr << ", fault." << dendl; protocol->fault(); + labeled_logger->inc(l_msgr_connection_ready_timeouts); } else { last_tick_id = center->create_time_event(connect_timeout_us, tick_handler); } @@ -803,6 +805,7 @@ void AsyncConnection::tick(uint64_t id) << " us, fault." 
<< dendl; protocol->fault(); + labeled_logger->inc(l_msgr_connection_idle_timeouts); } else { last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler); } diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h index 82c29985b18..78a590f8ca3 100644 --- a/src/msg/async/AsyncConnection.h +++ b/src/msg/async/AsyncConnection.h @@ -173,6 +173,7 @@ public: AsyncMessenger *async_msgr; uint64_t conn_id; PerfCounters *logger; + PerfCounters *labeled_logger; int state; ConnectedSocket cs; int port; diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc index 9376d46b0bd..b45ad8ca515 100644 --- a/src/msg/async/ProtocolV1.cc +++ b/src/msg/async/ProtocolV1.cc @@ -2414,6 +2414,7 @@ CtPtr ProtocolV1::replace(const AsyncConnectionRef& existing, existing->worker->references--; new_worker->references++; existing->logger = new_worker->get_perf_counter(); + existing->labeled_logger = new_worker->get_labeled_perf_counter(); existing->worker = new_worker; existing->center = new_center; if (existing->delay_state) diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc index 7cda9637d90..08426b796b8 100644 --- a/src/msg/async/ProtocolV2.cc +++ b/src/msg/async/ProtocolV2.cc @@ -2808,6 +2808,7 @@ CtPtr ProtocolV2::reuse_connection(const AsyncConnectionRef& existing, existing->worker->references--; new_worker->references++; existing->logger = new_worker->get_perf_counter(); + existing->labeled_logger = new_worker->get_labeled_perf_counter(); existing->worker = new_worker; existing->center = new_center; if (existing->delay_state) diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h index 376a87c7268..e19b6c89ce7 100644 --- a/src/msg/async/Stack.h +++ b/src/msg/async/Stack.h @@ -17,10 +17,12 @@ #ifndef CEPH_MSG_ASYNC_STACK_H #define CEPH_MSG_ASYNC_STACK_H -#include "include/spinlock.h" #include "common/perf_counters.h" -#include "msg/msg_types.h" +#include "common/perf_counters_key.h" +#include "include/spinlock.h" 
#include "msg/async/Event.h" +#include "msg/msg_types.h" +#include <string> class Worker; class ConnectedSocketImpl { @@ -214,6 +216,15 @@ enum { l_msgr_last, }; +enum { + l_msgr_labeled_first = l_msgr_last + 1, + + l_msgr_connection_ready_timeouts, + l_msgr_connection_idle_timeouts, + + l_msgr_labeled_last, +}; + class Worker { std::mutex init_lock; std::condition_variable init_cond; @@ -224,6 +235,7 @@ class Worker { CephContext *cct; PerfCounters *perf_logger; + PerfCounters *perf_labeled_logger; unsigned id; std::atomic_uint references; @@ -233,9 +245,11 @@ class Worker { Worker& operator=(const Worker&) = delete; Worker(CephContext *c, unsigned worker_id) - : cct(c), perf_logger(NULL), id(worker_id), references(0), center(c) { + : cct(c), id(worker_id), references(0), center(c) { char name[128]; - sprintf(name, "AsyncMessenger::Worker-%u", id); + char name_prefix[] = "AsyncMessenger::Worker"; + sprintf(name, "%s-%u", name_prefix, id); + // initialize perf_logger PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last); @@ -259,12 +273,35 @@ class Worker { perf_logger = plb.create_perf_counters(); cct->get_perfcounters_collection()->add(perf_logger); + + // Add labeled perfcounters + std::string labels = ceph::perf_counters::key_create( + name_prefix, {{"id", std::to_string(id)}}); + PerfCountersBuilder plb_labeled( + cct, labels, l_msgr_labeled_first, + l_msgr_labeled_last); + + plb_labeled.add_u64_counter( + l_msgr_connection_ready_timeouts, "msgr_connection_ready_timeouts", + "Number of not yet ready connections declared as dead", NULL, + PerfCountersBuilder::PRIO_USEFUL); + plb_labeled.add_u64_counter( + l_msgr_connection_idle_timeouts, "msgr_connection_idle_timeouts", + "Number of connections closed due to idleness", NULL, + PerfCountersBuilder::PRIO_USEFUL); + + perf_labeled_logger = plb_labeled.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_labeled_logger); } virtual ~Worker() { if (perf_logger) { 
cct->get_perfcounters_collection()->remove(perf_logger); delete perf_logger; } + if (perf_labeled_logger) { + cct->get_perfcounters_collection()->remove(perf_labeled_logger); + delete perf_labeled_logger; + } } virtual int listen(entity_addr_t &addr, unsigned addr_slot, @@ -275,6 +312,7 @@ class Worker { virtual void initialize() {} PerfCounters *get_perf_counter() { return perf_logger; } + PerfCounters *get_labeled_perf_counter() { return perf_labeled_logger; } void release_worker() { int oldref = references.fetch_sub(1); ceph_assert(oldref > 0); -- 2.47.3