git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
msgr: AsyncMessenger add faulted connections metrics
author Pere Diaz Bou <pdiazbou@redhat.com>
Mon, 6 Mar 2023 09:25:21 +0000 (10:25 +0100)
committer Pere Diaz Bou <pere-altea@hotmail.com>
Mon, 5 Jun 2023 07:44:01 +0000 (09:44 +0200)
Add msgr_connection_idle_timeouts and msgr_connection_ready_timeouts
labeled perfcounters to keep track of failed connections with prometheus metrics.

Signed-off-by: Pere Diaz Bou <pdiazbou@redhat.com>
Fixes: https://tracker.ceph.com/issues/59076
src/msg/async/AsyncConnection.cc
src/msg/async/AsyncConnection.h
src/msg/async/ProtocolV1.cc
src/msg/async/ProtocolV2.cc
src/msg/async/Stack.h

index 8051f5907ef113ebef3b6775aef52a2bb0b786e9..683be086efadad84a9221d45e31ba2dbed2bdc43 100644 (file)
@@ -116,6 +116,7 @@ AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQu
   : Connection(cct, m),
     delay_state(NULL), async_msgr(m), conn_id(q->get_id()),
     logger(w->get_perf_counter()),
+    labeled_logger(w->get_labeled_perf_counter()),
     state(STATE_NONE), port(-1),
     dispatch_queue(q), recv_buf(NULL),
     recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)),
@@ -791,6 +792,7 @@ void AsyncConnection::tick(uint64_t id)
                                 << target_addr << ", fault."
                                 << dendl;
       protocol->fault();
+      labeled_logger->inc(l_msgr_connection_ready_timeouts);
     } else {
       last_tick_id = center->create_time_event(connect_timeout_us, tick_handler);
     }
@@ -803,6 +805,7 @@ void AsyncConnection::tick(uint64_t id)
                                 << " us, fault."
                                 << dendl;
       protocol->fault();
+      labeled_logger->inc(l_msgr_connection_idle_timeouts);
     } else {
       last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler);
     }
index 82c29985b18d9e7ef9cb56b8d7f71c9ea0d51843..78a590f8ca393eaef103cf4303d19373a2febfce 100644 (file)
@@ -173,6 +173,7 @@ public:
   AsyncMessenger *async_msgr;
   uint64_t conn_id;
   PerfCounters *logger;
+  PerfCounters *labeled_logger;
   int state;
   ConnectedSocket cs;
   int port;
index 9376d46b0bd2ef1af6e4e7087210eb7646699f02..b45ad8ca5155f37dcadbe2ef7b1dffa4b5122deb 100644 (file)
@@ -2414,6 +2414,7 @@ CtPtr ProtocolV1::replace(const AsyncConnectionRef& existing,
               existing->worker->references--;
               new_worker->references++;
               existing->logger = new_worker->get_perf_counter();
+              existing->labeled_logger = new_worker->get_labeled_perf_counter();
               existing->worker = new_worker;
               existing->center = new_center;
               if (existing->delay_state)
index 7cda9637d90f0f0d65f3a9c27b09e6a2043e256c..08426b796b88b16c9e0142a7dc7d58d2d8d071f7 100644 (file)
@@ -2808,6 +2808,7 @@ CtPtr ProtocolV2::reuse_connection(const AsyncConnectionRef& existing,
             existing->worker->references--;
             new_worker->references++;
             existing->logger = new_worker->get_perf_counter();
+            existing->labeled_logger = new_worker->get_labeled_perf_counter();
             existing->worker = new_worker;
             existing->center = new_center;
             if (existing->delay_state)
index 376a87c7268771817d507c0120573dea48f715d9..e19b6c89ce79034949ccfaf5f196e640c5d5b69f 100644 (file)
 #ifndef CEPH_MSG_ASYNC_STACK_H
 #define CEPH_MSG_ASYNC_STACK_H
 
-#include "include/spinlock.h"
 #include "common/perf_counters.h"
-#include "msg/msg_types.h"
+#include "common/perf_counters_key.h"
+#include "include/spinlock.h"
 #include "msg/async/Event.h"
+#include "msg/msg_types.h"
+#include <string>
 
 class Worker;
 class ConnectedSocketImpl {
@@ -214,6 +216,15 @@ enum {
   l_msgr_last,
 };
 
+enum {
+  l_msgr_labeled_first = l_msgr_last + 1,  // labeled counter ids begin just past l_msgr_last; lower bound given to the labeled PerfCountersBuilder
+
+  l_msgr_connection_ready_timeouts,  // "Number of not yet ready connections declared as dead"; bumped in AsyncConnection::tick after the connect-timeout fault
+  l_msgr_connection_idle_timeouts,  // "Number of connections closed due to idleness"; bumped in AsyncConnection::tick after the inactive-timeout fault
+
+  l_msgr_labeled_last,  // upper bound given to the labeled PerfCountersBuilder
+};
+
 class Worker {
   std::mutex init_lock;
   std::condition_variable init_cond;
@@ -224,6 +235,7 @@ class Worker {
 
   CephContext *cct;
   PerfCounters *perf_logger;
+  PerfCounters *perf_labeled_logger;
   unsigned id;
 
   std::atomic_uint references;
@@ -233,9 +245,11 @@ class Worker {
   Worker& operator=(const Worker&) = delete;
 
   Worker(CephContext *c, unsigned worker_id)
-    : cct(c), perf_logger(NULL), id(worker_id), references(0), center(c) {
+    : cct(c), id(worker_id), references(0), center(c) {
     char name[128];
-    sprintf(name, "AsyncMessenger::Worker-%u", id);
+    char name_prefix[] = "AsyncMessenger::Worker";
+    sprintf(name, "%s-%u", name_prefix, id);
+
     // initialize perf_logger
     PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last);
 
@@ -259,12 +273,35 @@ class Worker {
 
     perf_logger = plb.create_perf_counters();
     cct->get_perfcounters_collection()->add(perf_logger);
+
+    // Add labeled perfcounters
+    std::string labels = ceph::perf_counters::key_create(
+      name_prefix, {{"id", std::to_string(id)}});
+    PerfCountersBuilder plb_labeled(
+        cct, labels, l_msgr_labeled_first,
+        l_msgr_labeled_last);
+
+    plb_labeled.add_u64_counter(
+        l_msgr_connection_ready_timeouts, "msgr_connection_ready_timeouts",
+        "Number of not yet ready connections declared as dead", NULL,
+        PerfCountersBuilder::PRIO_USEFUL);
+    plb_labeled.add_u64_counter(
+        l_msgr_connection_idle_timeouts, "msgr_connection_idle_timeouts",
+        "Number of connections closed due to idleness", NULL,
+        PerfCountersBuilder::PRIO_USEFUL);
+
+    perf_labeled_logger = plb_labeled.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_labeled_logger);
   }
   virtual ~Worker() {
     if (perf_logger) {
       cct->get_perfcounters_collection()->remove(perf_logger);
       delete perf_logger;
     }
+    if (perf_labeled_logger) {
+      cct->get_perfcounters_collection()->remove(perf_labeled_logger);
+      delete perf_labeled_logger;
+    }
   }
 
   virtual int listen(entity_addr_t &addr, unsigned addr_slot,
@@ -275,6 +312,7 @@ class Worker {
 
   virtual void initialize() {}
   PerfCounters *get_perf_counter() { return perf_logger; }
+  PerfCounters *get_labeled_perf_counter() { return perf_labeled_logger; }
   void release_worker() {
     int oldref = references.fetch_sub(1);
     ceph_assert(oldref > 0);