git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw/multisite: expose lock latency as perf counter for data sync
authorOguzhan Ozmen <oozmen@bloomberg.net>
Mon, 27 Apr 2026 23:07:03 +0000 (23:07 +0000)
committerOguzhan Ozmen <oozmen@bloomberg.net>
Wed, 29 Apr 2026 13:36:09 +0000 (13:36 +0000)
Add a "lock_latency" perf counter to the per-zone data sync counters.
This tracks the latency of RADOS lock/unlock operations in
RGWContinuousLeaseCR, giving operators visibility into the values
driving the LatencyConcurrencyControl.

The new perf counter can be queried via the admin socket:
  ceph daemon <asok> perf dump data-sync-from-<zone>
and reset independently:
  ceph daemon <asok> perf reset data-sync-from-<zone>

This allows operators to distinguish a poisoned (stale) average from
ongoing OSD latency issues without restarting the RGW process.

Signed-off-by: Oguzhan Ozmen <oozmen@bloomberg.net>
src/rgw/driver/rados/rgw_cr_rados.cc
src/rgw/driver/rados/rgw_cr_rados.h
src/rgw/driver/rados/rgw_data_sync.cc
src/rgw/driver/rados/rgw_sync_counters.cc
src/rgw/driver/rados/rgw_sync_counters.h

index fe44103a649dc4f7897b00da0f2ad3f088ff86c8..92ed99ab7e939e251b28ad68ea64a720dfe73ee8 100644 (file)
@@ -1035,7 +1035,11 @@ int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
       current_time = ceph::coarse_mono_clock::now();
       yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
       if (latency) {
-             latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+             auto elapsed = ceph::coarse_mono_clock::now() - current_time;
+             latency->add_latency(elapsed);
+             if (counters) {
+               counters->tinc(sync_counters::l_lock, elapsed);
+             }
       }
       current_time = ceph::coarse_mono_clock::now();
       if (current_time - last_renew_try_time > interval_tolerance) {
@@ -1059,7 +1063,11 @@ int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
     current_time = ceph::coarse_mono_clock::now();
     yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie));
     if (latency) {
-      latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+      auto elapsed = ceph::coarse_mono_clock::now() - current_time;
+      latency->add_latency(elapsed);
+      if (counters) {
+        counters->tinc(sync_counters::l_lock, elapsed);
+      }
     }
     return set_state(RGWCoroutine_Done);
   }
index df40db5dbc46ac2d6269fc1aed6fd59490a4ed7f..1aa9f17e335faa2914ce211db90ffe7e9ed113d8 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "services/svc_sys_obj.h"
 #include "services/svc_bucket.h"
+#include "include/common_fwd.h"
 
 struct rgw_http_param_pair;
 class RGWRESTConn;
@@ -1563,17 +1564,20 @@ class RGWContinuousLeaseCR : public RGWCoroutine {
   ceph::coarse_mono_time current_time;
 
   LatencyMonitor* latency;
+  PerfCounters* counters;
 
 public:
   RGWContinuousLeaseCR(RGWAsyncRadosProcessor* async_rados,
                        rgw::sal::RadosStore* _store,
                        rgw_raw_obj obj, std::string lock_name,
                        int interval, RGWCoroutine* caller,
-                      LatencyMonitor* const latency)
+                       LatencyMonitor* const latency,
+                       PerfCounters* counters = nullptr)
     : RGWCoroutine(_store->ctx()), async_rados(async_rados), store(_store),
       obj(std::move(obj)), lock_name(std::move(lock_name)),
       interval(interval), interval_tolerance(ceph::make_timespan(9*interval/10)),
-      ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency)
+      ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency),
+      counters(counters)
   {}
 
   virtual ~RGWContinuousLeaseCR() override;
index ff3d298b310f177bf936ad8f990c89393c8d0737..269fea0e7a788f242869b91a14b34623cbb698b6 100644 (file)
@@ -596,7 +596,7 @@ public:
       sc->env->async_rados, sc->env->driver,
       { sc->env->svc->zone->get_zone_params().log_pool,
        RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) },
-      string(lock_name), lock_duration, caller, &sc->lcc);
+      string(lock_name), lock_duration, caller, &sc->lcc, sc->env->counters);
   }
 
   int operate(const DoutPrefixProvider *dpp) override {
@@ -2337,7 +2337,7 @@ public:
     lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
                                             rgw_raw_obj(pool, status_oid),
                                             lock_name, lock_duration, this,
-                                           &sc->lcc));
+                                           &sc->lcc, sync_env->counters));
     lease_stack.reset(spawn(lease_cr.get(), false));
   }
 };
@@ -5880,7 +5880,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
 
         if (!no_lease && !bucket_lease_cr) {
           bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
-                lock_name, lock_duration, this, &sc->lcc));
+                lock_name, lock_duration, this, &sc->lcc, env->counters));
           yield spawn(bucket_lease_cr.get(), false);
           while (!bucket_lease_cr->is_locked()) {
             if (bucket_lease_cr->is_done()) {
@@ -5950,7 +5950,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
         // so the command is never blocked by a background sync process holding the lock.
         if (!no_lease && !bucket_lease_cr) {
           bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
-                                                        lock_name, lock_duration, this, &sc->lcc));
+                                                        lock_name, lock_duration, this, &sc->lcc, env->counters));
           yield spawn(bucket_lease_cr.get(), false);
           while (!bucket_lease_cr->is_locked()) {
             if (bucket_lease_cr->is_done()) {
index 3aaed7b32f6d4289b289bece0a631679794a443d..37a3a1e318b1b28216b81d32e68a5cf820e56e1d 100644 (file)
@@ -21,6 +21,8 @@ PerfCountersRef build(CephContext *cct, const std::string& name)
   b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests");
   b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors");
 
+  b.add_time_avg(l_lock, "lock_latency", "Average latency of sync lock operations");
+
   auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
   cct->get_perfcounters_collection()->add(logger.get());
   return logger;
index 416aaeaf392851e626f2526dac092a7da05e8fb8..0775d2063e0cc11148e0906564776192ddbe9759 100644 (file)
@@ -17,6 +17,8 @@ enum {
   l_poll,
   l_poll_err,
 
+  l_lock,
+
   l_last,
 };