]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: add watch ping timeout count in osd 56976/head
author yite.gu <yitegu0@gmail.com>
Thu, 18 Apr 2024 07:51:11 +0000 (15:51 +0800)
committer yite.gu <yitegu0@gmail.com>
Fri, 19 Apr 2024 09:33:18 +0000 (17:33 +0800)
For example, rbd sends a watch ping to the header object
every 5 seconds to keep its watch alive. If the primary OSD is unable
to receive the watch ping for the header object due to an rbd
network interruption, this means that rbd's I/O is already
hung. This way, we can quickly detect disconnected rbd clients on the OSD
and have them reflected in the metrics.

Signed-off-by: Yite Gu <yitegu0@gmail.com>
src/osd/PrimaryLogPG.cc
src/osd/osd_perf_counters.cc
src/osd/osd_perf_counters.h

index 62b8aad27d5b9de6fda22e61ae1c9f9a9b891bec..df25bbc96016c6bc171eb08c15ea92a31e4b04c6 100644 (file)
@@ -11826,6 +11826,10 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
   oi.watchers.erase(make_pair(watch->get_cookie(),
                              watch->get_entity()));
 
+  osd->logger->inc(l_osd_watch_timeouts);
+  dout(3) << __func__ << " watcher " << watch->get_peer_addr()
+         << " object " << obc->obs.oi.soid << dendl;
+
   list<watch_disconnect_t> watch_disconnects = {
     watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
   };
index 30f0ba531665479a124f704e27d2c2f22f1b8d77..a37fcbc29ba77dc383037f685d9ba327a2b2bb3d 100644 (file)
@@ -337,6 +337,10 @@ PerfCounters *build_osd_logger(CephContext *cct) {
   osd_plb.add_u64_counter_histogram(
       l_osd_scrub_reservation_dur_hist, "scrub_resrv_repnum_vs_duration",
       rsrv_hist_x_axis_config, rsrv_hist_y_axis_config, "Histogram of scrub replicas reservation duration");
+  osd_plb.add_u64_counter(
+  l_osd_watch_timeouts, "watch_timeouts",
+  "Number of watches that timed out or were blocklisted",
+  NULL, PerfCountersBuilder::PRIO_USEFUL);
 
   return osd_plb.create_perf_counters();
 }
index 00127dd7ff5d29f24a68a97c2811b87a18f3384d..9ecbbd2368dc3e97d61d9a7ef68b27bafdc39f99 100644 (file)
@@ -136,6 +136,8 @@ enum {
   // are labeled, and histograms do not fully support labels.
   l_osd_scrub_reservation_dur_hist,
 
+  l_osd_watch_timeouts,
+
   l_osd_last,
 };