From: yite.gu Date: Thu, 18 Apr 2024 07:51:11 +0000 (+0800) Subject: osd: add watch ping timeout count in osd X-Git-Tag: v20.0.0~1372^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=34b086e702f45a500b0d112c129fa890ef6c9f59;p=ceph.git osd: add watch ping timeout count in osd For example, rbd sends a watch ping to the header object every 5 seconds to keep the watch alive. If the primary OSD is unable to receive the watch ping for the header object because of an rbd network interruption, this means that rbd's I/O has already hung. This way, we can quickly detect disconnected rbd clients on the OSD and reflect them in metrics. Signed-off-by: Yite Gu --- diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 62b8aad27d5b..df25bbc96016 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -11826,6 +11826,10 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch) oi.watchers.erase(make_pair(watch->get_cookie(), watch->get_entity())); + osd->logger->inc(l_osd_watch_timeouts); + dout(3) << __func__ << " watcher " << watch->get_peer_addr() + << " object " << obc->obs.oi.soid << dendl; + list watch_disconnects = { watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true) }; diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc index 30f0ba531665..a37fcbc29ba7 100644 --- a/src/osd/osd_perf_counters.cc +++ b/src/osd/osd_perf_counters.cc @@ -337,6 +337,10 @@ PerfCounters *build_osd_logger(CephContext *cct) { osd_plb.add_u64_counter_histogram( l_osd_scrub_reservation_dur_hist, "scrub_resrv_repnum_vs_duration", rsrv_hist_x_axis_config, rsrv_hist_y_axis_config, "Histogram of scrub replicas reservation duration"); + osd_plb.add_u64_counter( + l_osd_watch_timeouts, "watch_timeouts", + "Number of watches that timed out or were blocklisted", + NULL, PerfCountersBuilder::PRIO_USEFUL); return osd_plb.create_perf_counters(); } diff --git a/src/osd/osd_perf_counters.h b/src/osd/osd_perf_counters.h index 
00127dd7ff5d..9ecbbd2368dc 100644 --- a/src/osd/osd_perf_counters.h +++ b/src/osd/osd_perf_counters.h @@ -136,6 +136,8 @@ enum { // are labeled, and histograms do not fully support labels. l_osd_scrub_reservation_dur_hist, + l_osd_watch_timeouts, + l_osd_last, };