From: Oguzhan Ozmen Date: Mon, 27 Apr 2026 23:07:03 +0000 (+0000) Subject: rgw/multisite: expose lock latency as perf counter for data sync X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=eb42801b6f0c34d06419ad4e44631eaefdb209d1;p=ceph.git rgw/multisite: expose lock latency as perf counter for data sync Add a "lock_latency" perf counter to the per-zone data sync counter. This tracks the latency of RADOS lock/unlock operations in RGWContinuousLeaseCR, giving operators visibility into the values driving the LatencyConcurrencyControl. The new perf counter can be queried via the admin socket: ceph daemon {rgw-asok} perf dump data-sync-from-{zone} and reset independently: ceph daemon {rgw-asok} perf reset data-sync-from-{zone} This would allow us to distinguish a poisoned average from ongoing OSD latency issues without restarting the RGW process. Signed-off-by: Oguzhan Ozmen --- diff --git a/src/rgw/driver/rados/rgw_cr_rados.cc b/src/rgw/driver/rados/rgw_cr_rados.cc index fe44103a649d..92ed99ab7e93 100644 --- a/src/rgw/driver/rados/rgw_cr_rados.cc +++ b/src/rgw/driver/rados/rgw_cr_rados.cc @@ -1035,7 +1035,11 @@ int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp) current_time = ceph::coarse_mono_clock::now(); yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval)); if (latency) { - latency->add_latency(ceph::coarse_mono_clock::now() - current_time); + auto elapsed = ceph::coarse_mono_clock::now() - current_time; + latency->add_latency(elapsed); + if (counters) { + counters->tinc(sync_counters::l_lock, elapsed); + } } current_time = ceph::coarse_mono_clock::now(); if (current_time - last_renew_try_time > interval_tolerance) { @@ -1059,7 +1063,11 @@ int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp) current_time = ceph::coarse_mono_clock::now(); yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie)); if (latency) { - latency->add_latency(ceph::coarse_mono_clock::now() - 
current_time); + auto elapsed = ceph::coarse_mono_clock::now() - current_time; + latency->add_latency(elapsed); + if (counters) { + counters->tinc(sync_counters::l_lock, elapsed); + } } return set_state(RGWCoroutine_Done); } diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h index df40db5dbc46..1aa9f17e335f 100644 --- a/src/rgw/driver/rados/rgw_cr_rados.h +++ b/src/rgw/driver/rados/rgw_cr_rados.h @@ -18,6 +18,7 @@ #include "services/svc_sys_obj.h" #include "services/svc_bucket.h" +#include "include/common_fwd.h" struct rgw_http_param_pair; class RGWRESTConn; @@ -1563,17 +1564,20 @@ class RGWContinuousLeaseCR : public RGWCoroutine { ceph::coarse_mono_time current_time; LatencyMonitor* latency; + PerfCounters* counters; public: RGWContinuousLeaseCR(RGWAsyncRadosProcessor* async_rados, rgw::sal::RadosStore* _store, rgw_raw_obj obj, std::string lock_name, int interval, RGWCoroutine* caller, - LatencyMonitor* const latency) + LatencyMonitor* const latency, + PerfCounters* counters = nullptr) : RGWCoroutine(_store->ctx()), async_rados(async_rados), store(_store), obj(std::move(obj)), lock_name(std::move(lock_name)), interval(interval), interval_tolerance(ceph::make_timespan(9*interval/10)), - ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency) + ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency), + counters(counters) {} virtual ~RGWContinuousLeaseCR() override; diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc index ff3d298b310f..269fea0e7a78 100644 --- a/src/rgw/driver/rados/rgw_data_sync.cc +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -596,7 +596,7 @@ public: sc->env->async_rados, sc->env->driver, { sc->env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) }, - string(lock_name), lock_duration, caller, &sc->lcc); + string(lock_name), lock_duration, caller, &sc->lcc, sc->env->counters); } int 
operate(const DoutPrefixProvider *dpp) override { @@ -2337,7 +2337,7 @@ public: lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver, rgw_raw_obj(pool, status_oid), lock_name, lock_duration, this, - &sc->lcc)); + &sc->lcc, sync_env->counters)); lease_stack.reset(spawn(lease_cr.get(), false)); } }; @@ -5880,7 +5880,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp) if (!no_lease && !bucket_lease_cr) { bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj, - lock_name, lock_duration, this, &sc->lcc)); + lock_name, lock_duration, this, &sc->lcc, env->counters)); yield spawn(bucket_lease_cr.get(), false); while (!bucket_lease_cr->is_locked()) { if (bucket_lease_cr->is_done()) { @@ -5950,7 +5950,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp) // so the command is never blocked by a background sync process holding the lock. if (!no_lease && !bucket_lease_cr) { bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj, - lock_name, lock_duration, this, &sc->lcc)); + lock_name, lock_duration, this, &sc->lcc, env->counters)); yield spawn(bucket_lease_cr.get(), false); while (!bucket_lease_cr->is_locked()) { if (bucket_lease_cr->is_done()) { diff --git a/src/rgw/driver/rados/rgw_sync_counters.cc b/src/rgw/driver/rados/rgw_sync_counters.cc index 3aaed7b32f6d..37a3a1e318b1 100644 --- a/src/rgw/driver/rados/rgw_sync_counters.cc +++ b/src/rgw/driver/rados/rgw_sync_counters.cc @@ -21,6 +21,8 @@ PerfCountersRef build(CephContext *cct, const std::string& name) b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests"); b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors"); + b.add_time_avg(l_lock, "lock_latency", "Average latency of sync lock operations"); + auto logger = PerfCountersRef{ b.create_perf_counters(), cct }; cct->get_perfcounters_collection()->add(logger.get()); return logger; diff --git 
a/src/rgw/driver/rados/rgw_sync_counters.h b/src/rgw/driver/rados/rgw_sync_counters.h index 416aaeaf3928..0775d2063e0c 100644 --- a/src/rgw/driver/rados/rgw_sync_counters.h +++ b/src/rgw/driver/rados/rgw_sync_counters.h @@ -17,6 +17,8 @@ enum { l_poll, l_poll_err, + l_lock, + l_last, };